From aff7f03218b580f8f7f5f7292633082883c33f6c Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 18 May 2023 10:24:42 -0500 Subject: [PATCH 01/27] add ThreadLocalCache --- include/kernels/gauge_stout.cuh | 5 +- include/kernels/gauge_utils.cuh | 1 + include/kernels/gauge_wilson_flow.cuh | 5 +- include/kernels/hisq_paths_force.cuh | 77 +++++------ include/targets/cuda/thread_local_cache.h | 132 +++++++++++++++++++ include/targets/generic/thread_local_cache.h | 68 ++++++++++ 6 files changed, 247 insertions(+), 41 deletions(-) create mode 100644 include/targets/cuda/thread_local_cache.h create mode 100644 include/targets/generic/thread_local_cache.h diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 45cc176d88..ca22d0e9ac 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -6,6 +6,7 @@ #include #include #include +#include namespace quda { @@ -134,8 +135,8 @@ namespace quda } Link U, Q; - SharedMemoryCache Stap(target::block_dim()); - SharedMemoryCache Rect(target::block_dim(), sizeof(Link)); + ThreadLocalCache Stap{}; + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh index 48c7e6c1cc..6e91e1ac31 100644 --- a/include/kernels/gauge_utils.cuh +++ b/include/kernels/gauge_utils.cuh @@ -2,6 +2,7 @@ #include #include #include +#include namespace quda { diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index 327f7c7eb0..1799d3c0dd 100644 --- a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -4,6 +4,7 @@ #include #include #include +#include namespace quda { @@ -71,8 +72,8 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. 
From the following paper: // https://arxiv.org/abs/0801.1165 - SharedMemoryCache Stap(target::block_dim()); - SharedMemoryCache Rect(target::block_dim(), sizeof(Link)); // offset to ensure non-overlapping allocations + ThreadLocalCache Stap{}; + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); break; diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index cac909bf8a..522442da3b 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace quda { @@ -272,7 +272,7 @@ namespace quda { * A _______ B * mu_next | | * H| |G - * + * * Variables have been named to reflection dimensionality for * mu_positive == true, sig_positive == true, mu_next_positive == true **************************************************************************/ @@ -372,7 +372,7 @@ namespace quda { @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x @param[in/out] force_mu Accumulated force in the mu direction - @param[in] Uab_cache Shared memory cache that stores the gauge link going from a to b (read) + @param[in] Uab_cache Thread local cache that stores the gauge link going from a to b (read) @details This subset of the code computes the Lepage contribution to the fermion force. Data traffic: READ: cb_link, id_link, pMu_at_c @@ -386,7 +386,8 @@ namespace quda { Flops: 2 multiplies, 1 add, 1 rescale */ - __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu, SharedMemoryCache &Uab_cache) { + template + __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu, LinkCache &Uab_cache) { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -440,7 +441,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in] Uab_cache Shared memory cache that stores the gauge link going from a to b (read) + @param[in] Uab_cache Thread local cache that stores the gauge link going from a to b (read) Data traffic: READ: gb_link, oProd_at_h WRITE: pMu_next_at_b, p3_at_a @@ -454,7 +455,8 @@ namespace quda { Flops: 2 multiplies, 1 add, 1 rescale */ - __device__ __host__ inline void middle_three(int x[4], int point_a, int parity_a, SharedMemoryCache &Uab_cache) + template + __device__ __host__ inline void middle_three(int x[4], int point_a, int parity_a, LinkCache &Uab_cache) { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -535,8 +537,8 @@ namespace quda { /* * The "extra" low point corresponds to the Lepage contribution to the * force_mu term. 
- * - * + * + * * sig * F E * | | @@ -557,7 +559,7 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - SharedMemoryCache Uab_cache(target::block_dim()); + ThreadLocalCache Uab_cache{}; // Scoped load of Uab { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -636,7 +638,7 @@ namespace quda { Link force; Link shortP; Link p5; - + const Link pMu; // double-buffer: read pNuMu, qNuMu for side 5, middle 7 @@ -688,7 +690,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Shared memory cache that maintains the accumulated P5 contribution (write) + @param[in/out] Matrix_cache Thread local cache that maintains the accumulated P5 contribution (write) the gauge link going from a to b (read), as well as force_sig when sig is positive (read/write) @details This subset of the code computes the full seven link contribution to the HISQ force. Data traffic: @@ -705,8 +707,8 @@ namespace quda { Flops: 4 multiplies, 2 adds, 2 rescales */ - __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, - SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) { auto mycoeff_seven = parity_sign(parity_a) * coeff_sign(parity_a) * arg.coeff_seven; int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -735,19 +737,19 @@ namespace quda { UbeOeOf = Ube * OeOf; // Cache Ube to below - Matrix_cache.save_z(Ube, 1); + Matrix_cache[1] = Ube; } // Take care of force_sig --- contribution from the negative rho direction Link Uaf = arg.link(arg.rho, point_a, parity_a); if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * UbeOeOf, conj(Uaf), force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } // Compute the force_rho --- contribution from the negative rho direction - Link Uab = Matrix_cache.load_z(0); + Link Uab = Matrix_cache[0]; if constexpr (!sig_positive) Uab = conj(Uab); Link force_rho = arg.force(arg.rho, point_a, parity_a); force_rho = mm_add(mycoeff_seven * conj(UbeOeOf), conj(Uab), force_rho); @@ -756,7 +758,7 @@ namespace quda { Link Ufe = arg.link(arg.sig, fe_link_nbr_idx, fe_link_nbr_parity); // Load Ube from the cache - Link Ube = Matrix_cache.load_z(1); + Link Ube = Matrix_cache[1]; // Form the product UfeUebOb Link UfeUeb = (sig_positive ? Ufe : conj(Ufe)) * conj(Ube); @@ -788,7 +790,7 @@ namespace quda { Link Oz = Ucb * Ob; Link Oy = (sig_positive ? 
Udc : conj(Udc)) * Oz; p5_sig = mm_add(arg.accumu_coeff_seven * conj(Uda), Oy, p5_sig); - Matrix_cache.save_z(p5_sig, 1); + Matrix_cache[1] = p5_sig; // When sig is positive, compute the force_sig contribution from the // positive rho direction @@ -796,9 +798,9 @@ namespace quda { Link Od = arg.qNuMu(0, point_d, parity_d); Link Oc = arg.pNuMu(0, point_c, parity_c); Link Oz = conj(Ucb) * Oc; - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * Oz, Od * Uda, force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } } @@ -808,7 +810,7 @@ namespace quda { @param[in] x Local coordinate @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Shared memory cache that maintains the full P5 contribution + @param[in/out] Matrix_cache Thread local cache that maintains the full P5 contribution summed from the previous middle five and all seven (read), as well as force_sig when sig is positive (read/write) @details This subset of the code computes the side link five link contribution to the HISQ force. @@ -818,7 +820,8 @@ namespace quda { Flops: 2 multiplies, 2 adds, 2 rescales */ - __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) { int y[4] = {x[0], x[1], x[2], x[3]}; int point_h = updateCoordExtendedIndexShiftMILC(y, arg.nu, arg); int parity_h = 1 - parity_a; @@ -832,7 +835,7 @@ namespace quda { int qh_link_nbr_idx = mu_positive ? point_q : point_h; int qh_link_nbr_parity = mu_positive ? parity_q : parity_h; - Link P5 = Matrix_cache.load_z(1); + Link P5 = Matrix_cache[1]; Link Uah = arg.link(arg.nu, ha_link_nbr_idx, ha_link_nbr_parity); Link Ow = nu_positive ? Uah * P5 : conj(Uah) * P5; @@ -857,7 +860,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Helper shared memory cache that maintains the gauge link going + @param[in/out] Matrix_cache Thread local cache that maintains the gauge link going from a to b (read) and, when sig is positive, force_sig (read/write) @details This subset of the code computes the middle link five link contribution to the HISQ force. 
Data traffic: @@ -870,8 +873,8 @@ namespace quda { Flops: 1 multiply, 1 add, 1 rescale */ - __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, - SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -902,7 +905,7 @@ namespace quda { arg.pNuMu_next(0, point_b, parity_b) = Ow; { // scoped Uab load - Link Uab = Matrix_cache.load_z(0); + Link Uab = Matrix_cache[0]; if constexpr (!sig_positive) Uab = conj(Uab); arg.p5(0, point_a, parity_a) = Uab * Ow; } @@ -917,9 +920,9 @@ namespace quda { // compute the force in the sigma direction if sig is positive if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(arg.coeff_five * Ow, Ox, force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } } @@ -955,14 +958,14 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - + // calculate p5_sig - auto block_dim = target::block_dim(); - block_dim.z = (sig_positive ? 3 : 2); - SharedMemoryCache Matrix_cache(block_dim); + constexpr int cacheLen = sig_positive ? 3 : 2; + ThreadLocalCache> Matrix_cache{}; + if constexpr (sig_positive) { Link force_sig = arg.force(arg.sig, point_a, parity_a); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } // Scoped load of Uab @@ -972,7 +975,7 @@ namespace quda { int ab_link_nbr_idx = (sig_positive) ? point_a : point_b; int ab_link_nbr_parity = (sig_positive) ? parity_a : parity_b; Link Uab = arg.link(arg.sig, ab_link_nbr_idx, ab_link_nbr_parity); - Matrix_cache.save_z(Uab, 0); + Matrix_cache[0] = Uab; } // accumulate into P5, force_sig @@ -987,7 +990,7 @@ namespace quda { // update the force in the sigma direction if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; arg.force(arg.sig, point_a, parity_a) = force_sig; } diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h new file mode 100644 index 0000000000..d730c3b4e7 --- /dev/null +++ b/include/targets/cuda/thread_local_cache.h @@ -0,0 +1,132 @@ +#pragma once + +#include + +/** + @file thread_local_cache.h + + Thread local cache object which may use shared memory for optimization. + */ + +namespace quda +{ + + /** + @brief Class for threads to store a unique value which can use + shared memory for optimization purposes. 
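+
+     A minimal usage sketch (illustrative only: the Link type and the
+     computeStaple() helper are assumptions drawn from the gauge kernels
+     patched above, not part of this class):
+
+       ThreadLocalCache<Link> Stap{};
+       Stap = computeStaple(arg, x);  // store via the assignment operator
+       Link U = Stap;                 // load back via the conversion to T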
+ */ + template class ThreadLocalCache + { + public: + using value_type = T; + using offset_type = O; // type of object that may also use shared memory at the same which is created before this one + + private: + const unsigned int offset = 0; // dynamic offset in bytes + + /** + @brief This is a dummy instantiation for the host compiler + */ + template struct cache_dynamic { + T *operator()(unsigned) + { + static T *cache_; + return cache_; + } + }; + + /** + @brief This is the handle to the shared memory, dynamic specialization + @return Shared memory pointer + */ + template struct cache_dynamic { + __device__ inline T *operator()(unsigned int offset) + { + extern __shared__ int cache_[]; + return reinterpret_cast(reinterpret_cast(cache_) + offset); + } + }; + + __device__ __host__ inline T * cache() const + { + return target::dispatch(offset); + } + + __device__ __host__ inline void save_detail(const T &a) const + { + int j = target::thread_idx_linear<3>(); + cache()[j] = a; + } + + __device__ __host__ inline T &load_detail() const + { + int j = target::thread_idx_linear<3>(); + return cache()[j]; + } + + public: + static constexpr unsigned int get_offset() { + if constexpr(std::is_same_v) { + return 0; + } else { + return O::size(); + } + } + + static constexpr unsigned int size() { + return get_offset() + target::block_size<3>() * sizeof(T); + } + + /** + @brief Constructor for ThreadLocalCache. + */ + constexpr ThreadLocalCache() : offset(get_offset()) {} + + /** + @brief Grab the raw base address to this cache. + */ + __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } + + /** + @brief Save the value into the thread local cache. + @param[in] a The value to store in the thread local cache + */ + __device__ __host__ inline void save(const T &a) const + { + save_detail(a); + } + + /** + @brief Load a value from the thread local cache + @return The value at the linear thread index + */ + __device__ __host__ inline T &load() const + { + return load_detail(); + } + + /** + @brief Cast operator to allow cache objects to be used where T + is expected + */ + __device__ __host__ operator T() const { return load(); } + //__device__ __host__ operator T() const { T a; return a; } + + /** + @brief Assignment operator to allow cache objects to be used on + the lhs where T is otherwise expected. + */ + __device__ __host__ void operator=(const T &src) const { save(src); } + //__device__ __host__ void operator=(const T &src) const { ; } + + /** + @brief Subscripting operator returning reference to allow cache objects + to assign to a subscripted element. + */ + __device__ __host__ auto& operator[](int i) { return load()[i]; } + }; + +} // namespace quda + +// include overloads +#include "../generic/thread_local_cache.h" diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h new file mode 100644 index 0000000000..f4af1b0a21 --- /dev/null +++ b/include/targets/generic/thread_local_cache.h @@ -0,0 +1,68 @@ +#include + +/** + @file thread_local_cache_helper.h + @brief Convenience overloads to allow ThreadLocalCache objects to + appear in simple expressions. The actual implementation of + ThreadLocalCache is target specific, and located in e.g., + include/targets/cuda/thread_local_cache.h, etc. 
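+
+     For example (an illustrative sketch; Stap and delta are assumed
+     names, with Stap a ThreadLocalCache holding a Link matrix):
+
+       Stap += delta;           // read-modify-write via operator+=
+       Link Z = Stap + delta;   // operator+ loads the cached value
+       Link C = conj(Stap);     // conj() applied to the cached value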
+ */ + +namespace quda +{ + + template + __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + { + return static_cast(a) + b; + } + + template + __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + { + return a + static_cast(b); + } + + template + __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + { + return static_cast(a) - b; + } + + template + __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + { + return a - static_cast(b); + } + + template + __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + { + a.save(static_cast(a) + b); + return a; + } + + template + __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + { + a.save(static_cast(a) - b); + return a; + } + + template + __device__ __host__ inline auto conj(const ThreadLocalCache &a) + { + return conj(static_cast(a)); + } + + /** + @brief Uniform helper for exposing type T, whether we are dealing + with an instance of T or ThreadLocalCache + */ + template + struct get_type< + T, std::enable_if_t>>> { + using type = typename T::value_type; + }; + +} // namespace quda From 8f297f1c7db49671f6d2e2eb6eb006cfaf42f2cc Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 18 May 2023 13:14:37 -0500 Subject: [PATCH 02/27] fix CI error --- include/targets/cuda/thread_local_cache.h | 37 ++++++++------------ include/targets/generic/thread_local_cache.h | 24 +++++-------- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index d730c3b4e7..7b3a51c547 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -47,10 +47,7 @@ namespace quda } }; - __device__ __host__ inline T * cache() const - { - return target::dispatch(offset); - } + __device__ __host__ inline T *cache() const { return target::dispatch(offset); } __device__ __host__ inline void save_detail(const T &a) const { @@ -64,23 +61,23 @@ namespace quda return cache()[j]; } - public: - static constexpr unsigned int get_offset() { - if constexpr(std::is_same_v) { - return 0; - } else { - return O::size(); - } + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; } - static constexpr unsigned int size() { - return get_offset() + target::block_size<3>() * sizeof(T); + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + block.x * block.y * block.z * sizeof(T); } /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : offset(get_offset()) {} + constexpr ThreadLocalCache() : offset(get_offset(target::block_dim())) { } /** @brief Grab the raw base address to this cache. @@ -91,19 +88,13 @@ namespace quda @brief Save the value into the thread local cache. 
@param[in] a The value to store in the thread local cache */ - __device__ __host__ inline void save(const T &a) const - { - save_detail(a); - } + __device__ __host__ inline void save(const T &a) const { save_detail(a); } /** @brief Load a value from the thread local cache @return The value at the linear thread index */ - __device__ __host__ inline T &load() const - { - return load_detail(); - } + __device__ __host__ inline T &load() const { return load_detail(); } /** @brief Cast operator to allow cache objects to be used where T @@ -123,7 +114,7 @@ namespace quda @brief Subscripting operator returning reference to allow cache objects to assign to a subscripted element. */ - __device__ __host__ auto& operator[](int i) { return load()[i]; } + __device__ __host__ auto &operator[](int i) { return load()[i]; } }; } // namespace quda diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index f4af1b0a21..bc9cbc2e7e 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -11,46 +11,39 @@ namespace quda { - template - __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; } - template - __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) { return a + static_cast(b); } - template - __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) { return static_cast(a) - b; } - template - __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) { return a - static_cast(b); } - template - __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) + b); return a; } - template - __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) - b); return a; } - template - __device__ __host__ inline auto conj(const ThreadLocalCache &a) + template __device__ __host__ inline auto conj(const ThreadLocalCache &a) { return conj(static_cast(a)); } @@ -60,8 +53,7 @@ namespace quda with an instance of T or ThreadLocalCache */ template - struct get_type< - T, std::enable_if_t>>> { + struct get_type>>> { using type = typename T::value_type; }; From 1a3614c9c484a375b2abd387aae6fd483dd62f37 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Mon, 22 May 2023 16:21:22 -0500 Subject: [PATCH 03/27] add coalescing to ThreadLocalCache --- include/targets/cuda/thread_local_cache.h | 31 +++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index 7b3a51c547..a0bb4a2340 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -22,15 +22,22 @@ namespace quda using offset_type = O; // type of object that may also use shared memory at the same which is created before this one private: + using atom_t = std::conditional_t>; + static_assert(sizeof(T) % 4 
== 0, "Shared memory cache does not support sub-word size types"); + + // The number of elements of type atom_t that we break T into for optimal shared-memory access + static constexpr int n_element = sizeof(T) / sizeof(atom_t); + + const int stride; const unsigned int offset = 0; // dynamic offset in bytes /** @brief This is a dummy instantiation for the host compiler */ template struct cache_dynamic { - T *operator()(unsigned) + atom_t *operator()(unsigned) { - static T *cache_; + static atom_t *cache_; return cache_; } }; @@ -40,25 +47,33 @@ namespace quda @return Shared memory pointer */ template struct cache_dynamic { - __device__ inline T *operator()(unsigned int offset) + __device__ inline atom_t *operator()(unsigned int offset) { extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); + return reinterpret_cast(reinterpret_cast(cache_) + offset); } }; - __device__ __host__ inline T *cache() const { return target::dispatch(offset); } + __device__ __host__ inline atom_t *cache() const { return target::dispatch(offset); } __device__ __host__ inline void save_detail(const T &a) const { + atom_t tmp[n_element]; + memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); - cache()[j] = a; +#pragma unroll + for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; } __device__ __host__ inline T &load_detail() const { + atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); - return cache()[j]; +#pragma unroll + for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; + T a; + memcpy((void *)&a, tmp, sizeof(T)); + return a; } static constexpr unsigned int get_offset(dim3 block) @@ -77,7 +92,7 @@ namespace quda /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : offset(get_offset(target::block_dim())) { } + constexpr ThreadLocalCache() : stride(target::block_size<3>()), offset(get_offset(target::block_dim())) { } /** @brief Grab the raw base address to this cache. From c2235ee18d3beb4104217435e94a7354883af603 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 23 May 2023 16:15:26 -0500 Subject: [PATCH 04/27] don't return reference for ThreadLocalCache indexing --- include/kernels/gauge_stout.cuh | 2 +- include/kernels/gauge_wilson_flow.cuh | 2 +- include/kernels/hisq_paths_force.cuh | 20 ++++---- include/targets/cuda/thread_local_cache.h | 50 +++++++++++++++----- include/targets/generic/thread_local_cache.h | 16 +++---- 5 files changed, 57 insertions(+), 33 deletions(-) diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index ca22d0e9ac..56bd00f425 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -136,7 +136,7 @@ namespace quda Link U, Q; ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index 1799d3c0dd..050295f271 100644 --- a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -73,7 +73,7 @@ namespace quda // and the 1x2 and 2x1 rectangles of length 5. 
From the following paper: // https://arxiv.org/abs/0801.1165 ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); break; diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index 522442da3b..bdba9d6bbe 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -415,7 +415,7 @@ namespace quda { Link Ow = mu_positive ? (conj(Ucb) * Oc) : (Ucb * Oc); { - Link Uab = Uab_cache.load(); + Link Uab = Uab_cache; Link Oy = sig_positive ? Uab * Ow : conj(Uab) * Ow; Link Ox = mu_positive ? (Oy * Uid) : (Uid * conj(Oy)); auto mycoeff_lepage = -coeff_sign(parity_a)*coeff_sign(parity_a)*arg.coeff_lepage; @@ -489,7 +489,7 @@ namespace quda { arg.pMu_next(0, point_b, parity_b) = Oz; { // scoped Uab load - Link Uab = Uab_cache.load(); + Link Uab = Uab_cache; if constexpr (!sig_positive) Uab = conj(Uab); arg.p3(0, point_a, parity_a) = Uab * Oz; } @@ -737,7 +737,7 @@ namespace quda { UbeOeOf = Ube * OeOf; // Cache Ube to below - Matrix_cache[1] = Ube; + Matrix_cache.save(Ube, 1); } // Take care of force_sig --- contribution from the negative rho direction @@ -745,7 +745,7 @@ namespace quda { if constexpr (sig_positive) { Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * UbeOeOf, conj(Uaf), force_sig); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } // Compute the force_rho --- contribution from the negative rho direction @@ -790,7 +790,7 @@ namespace quda { Link Oz = Ucb * Ob; Link Oy = (sig_positive ? Udc : conj(Udc)) * Oz; p5_sig = mm_add(arg.accumu_coeff_seven * conj(Uda), Oy, p5_sig); - Matrix_cache[1] = p5_sig; + Matrix_cache.save(p5_sig, 1); // When sig is positive, compute the force_sig contribution from the // positive rho direction @@ -800,7 +800,7 @@ namespace quda { Link Oz = conj(Ucb) * Oc; Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * Oz, Od * Uda, force_sig); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } } @@ -922,7 +922,7 @@ namespace quda { if constexpr (sig_positive) { Link force_sig = Matrix_cache[2]; force_sig = mm_add(arg.coeff_five * Ow, Ox, force_sig); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } } @@ -961,11 +961,11 @@ namespace quda { // calculate p5_sig constexpr int cacheLen = sig_positive ? 3 : 2; - ThreadLocalCache> Matrix_cache{}; + ThreadLocalCache Matrix_cache{}; if constexpr (sig_positive) { Link force_sig = arg.force(arg.sig, point_a, parity_a); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } // Scoped load of Uab @@ -975,7 +975,7 @@ namespace quda { int ab_link_nbr_idx = (sig_positive) ? point_a : point_b; int ab_link_nbr_parity = (sig_positive) ? parity_a : parity_b; Link Uab = arg.link(arg.sig, ab_link_nbr_idx, ab_link_nbr_parity); - Matrix_cache[0] = Uab; + Matrix_cache.save(Uab, 0); } // accumulate into P5, force_sig diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index a0bb4a2340..45eeca41da 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -15,11 +15,13 @@ namespace quda @brief Class for threads to store a unique value which can use shared memory for optimization purposes. 
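+     Indexed-access sketch (illustrative): loads copy data out of shared
+     memory, so operator[] returns by value rather than by reference, and
+     writes at an index go through save():
+
+       ThreadLocalCache<Link, 3> cache{};
+       cache.save(force_sig, 2);  // write element 2
+       Link f = cache[2];         // read element 2 (a copy, not a reference)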
*/ - template class ThreadLocalCache + template class ThreadLocalCache { public: using value_type = T; using offset_type = O; // type of object that may also use shared memory at the same which is created before this one + static constexpr int N = N_; + static constexpr int len = std::max(1,N); private: using atom_t = std::conditional_t>; @@ -56,21 +58,21 @@ namespace quda __device__ __host__ inline atom_t *cache() const { return target::dispatch(offset); } - __device__ __host__ inline void save_detail(const T &a) const + __device__ __host__ inline void save_detail(const T &a, const int k) const { atom_t tmp[n_element]; memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) cache()[(k*n_element + i) * stride + j] = tmp[i]; } - __device__ __host__ inline T &load_detail() const + __device__ __host__ inline T load_detail(const int k) const { atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = cache()[(k*n_element + i) * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -86,7 +88,7 @@ namespace quda public: static constexpr unsigned int shared_mem_size(dim3 block) { - return get_offset(block) + block.x * block.y * block.z * sizeof(T); + return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); } /** @@ -103,33 +105,55 @@ namespace quda @brief Save the value into the thread local cache. @param[in] a The value to store in the thread local cache */ - __device__ __host__ inline void save(const T &a) const { save_detail(a); } + __device__ __host__ inline void save(const T &a) const { + static_assert(N == 0); + save_detail(a, 0); + } + + /** + @brief Save the value into the thread local cache. + @param[in] a The value to store in the thread local cache + */ + __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } + + /** + @brief Load a value from the thread local cache + @return The value at the linear thread index + */ + __device__ __host__ inline T load() const { + static_assert(N == 0); + return load_detail(0); + } /** @brief Load a value from the thread local cache @return The value at the linear thread index */ - __device__ __host__ inline T &load() const { return load_detail(); } + __device__ __host__ inline T load(const int k) const { return load_detail(k); } /** @brief Cast operator to allow cache objects to be used where T is expected */ - __device__ __host__ operator T() const { return load(); } - //__device__ __host__ operator T() const { T a; return a; } + __device__ __host__ operator T() const { + static_assert(N == 0); + return load(0); + } /** @brief Assignment operator to allow cache objects to be used on the lhs where T is otherwise expected. */ - __device__ __host__ void operator=(const T &src) const { save(src); } - //__device__ __host__ void operator=(const T &src) const { ; } + __device__ __host__ void operator=(const T &src) const { + static_assert(N == 0); + save(src, 0); + } /** @brief Subscripting operator returning reference to allow cache objects to assign to a subscripted element. 
*/ - __device__ __host__ auto &operator[](int i) { return load()[i]; } + __device__ __host__ auto operator[](int i) { return load(i); } }; } // namespace quda diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index bc9cbc2e7e..f2aa515801 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -11,39 +11,39 @@ namespace quda { - template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; } - template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) { return a + static_cast(b); } - template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) { return static_cast(a) - b; } - template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) { return a - static_cast(b); } - template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) + b); return a; } - template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) - b); return a; } - template __device__ __host__ inline auto conj(const ThreadLocalCache &a) + template __device__ __host__ inline auto conj(const ThreadLocalCache &a) { return conj(static_cast(a)); } @@ -53,7 +53,7 @@ namespace quda with an instance of T or ThreadLocalCache */ template - struct get_type>>> { + struct get_type>>> { using type = typename T::value_type; }; From f686285341056eada006a6e55246fad137308554 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 24 May 2023 13:02:54 -0500 Subject: [PATCH 05/27] update docs --- include/targets/cuda/thread_local_cache.h | 41 ++++++++++++----------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index 45eeca41da..286f019a28 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -6,26 +6,27 @@ @file thread_local_cache.h Thread local cache object which may use shared memory for optimization. + The storage can be a single object or an array of objects. */ namespace quda { /** - @brief Class for threads to store a unique value which can use + @brief Class for threads to store a unique value, or array of values, which can use shared memory for optimization purposes. 
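+     The N parameter selects the behavior (illustrative sketch):
+
+       ThreadLocalCache<Link> single{};   // N == 0: acts like a single Link
+       single = U;                        // assignment and conversion apply
+       ThreadLocalCache<Link, 2> pair{};  // N > 0: acts like Link[2]
+       pair.save(U, 0);                   // indexed save/load only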
*/ template class ThreadLocalCache { public: using value_type = T; - using offset_type = O; // type of object that may also use shared memory at the same which is created before this one - static constexpr int N = N_; - static constexpr int len = std::max(1,N); + using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one + static constexpr int N = N_; // size of array, 0 means to behave like T instead of array + static constexpr int len = std::max(1,N); // actual number of elements to store private: using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); + static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); // The number of elements of type atom_t that we break T into for optimal shared-memory access static constexpr int n_element = sizeof(T) / sizeof(atom_t); @@ -102,7 +103,7 @@ namespace quda __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } /** - @brief Save the value into the thread local cache. + @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. @param[in] a The value to store in the thread local cache */ __device__ __host__ inline void save(const T &a) const { @@ -111,14 +112,15 @@ namespace quda } /** - @brief Save the value into the thread local cache. + @brief Save the value into an element of the thread local cache. @param[in] a The value to store in the thread local cache + @param[in] k The index to use */ __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } /** - @brief Load a value from the thread local cache - @return The value at the linear thread index + @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. + @return The value stored in the thread local cache */ __device__ __host__ inline T load() const { static_assert(N == 0); @@ -126,34 +128,35 @@ namespace quda } /** - @brief Load a value from the thread local cache - @return The value at the linear thread index + @brief Load a value from an element of the thread local cache + @param[in] k The index to use + @return The value stored in the thread local cache at that index */ __device__ __host__ inline T load(const int k) const { return load_detail(k); } /** - @brief Cast operator to allow cache objects to be used where T - is expected + @brief Cast operator to allow cache objects to be used where T is expected (when N==0). */ __device__ __host__ operator T() const { static_assert(N == 0); - return load(0); + return load(); } /** @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected. + the lhs where T is otherwise expected (when N==0). */ __device__ __host__ void operator=(const T &src) const { static_assert(N == 0); - save(src, 0); + save(src); } /** - @brief Subscripting operator returning reference to allow cache objects - to assign to a subscripted element. + @brief Subscripting operator returning value at index for convenience. 
+ @param[in] i The index to use + @return The value stored in the thread local cache at that index */ - __device__ __host__ auto operator[](int i) { return load(i); } + __device__ __host__ T operator[](int i) { return load(i); } }; } // namespace quda From b16b9b2bc7e86cf03fae82e6d4793d2b0692efc9 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 11 Jul 2023 13:09:16 -0500 Subject: [PATCH 06/27] add shared memory helper --- include/targets/cuda/shared_memory_helper.h | 85 ++++++++++ include/targets/cuda/thread_local_cache.h | 164 ------------------- include/targets/generic/thread_local_cache.h | 139 +++++++++++++++- 3 files changed, 219 insertions(+), 169 deletions(-) create mode 100644 include/targets/cuda/shared_memory_helper.h diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h new file mode 100644 index 0000000000..bd8d919359 --- /dev/null +++ b/include/targets/cuda/shared_memory_helper.h @@ -0,0 +1,85 @@ +#pragma once + +#include + +/** + @file shared_memory_helper.h + + Target specific helper for allocating and accessing shared memory. + */ + +namespace quda +{ + + /** + @brief Class which is used to allocate and access shared memory. + The shared memory is treated as an array of type T, with the + number of elements given by the static member S::size(). The + offset from the beginning of the total shared memory block is + given by the static member O::shared_mem_size(block), or 0 if O + is void. + */ + template class SharedMemory + { + public: + using value_type = T; + + private: + T *data; + const unsigned int size; // number of elements of type T + + /** + @brief This is a dummy instantiation for the host compiler + */ + template struct cache_dynamic { + T *operator()(unsigned int) + { + static T *cache_; + return cache_; + } + }; + + /** + @brief This is the handle to the dynamic shared memory + @return Shared memory pointer + */ + template struct cache_dynamic { + __device__ inline T *operator()(unsigned int offset) + { + extern __shared__ int cache_[]; + return reinterpret_cast(reinterpret_cast(cache_) + offset); + } + }; + + __device__ __host__ inline T *cache(unsigned int offset) const + { + return target::dispatch(offset); + } + + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; + } + + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + S::size()*sizeof(T); + } + + /** + @brief Constructor for SharedMemory object. + */ + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + + /** + @brief Subscripting operator returning a reference to element. + @param[in] i The index to use. + @return Reference to value stored at that index. + */ + __device__ __host__ T &operator[](const int i) const { return data[i]; } + }; + +} // namespace quda diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index 286f019a28..dd4cd863fc 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -1,165 +1 @@ -#pragma once - -#include - -/** - @file thread_local_cache.h - - Thread local cache object which may use shared memory for optimization. - The storage can be a single object or an array of objects. 
- */ - -namespace quda -{ - - /** - @brief Class for threads to store a unique value, or array of values, which can use - shared memory for optimization purposes. - */ - template class ThreadLocalCache - { - public: - using value_type = T; - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one - static constexpr int N = N_; // size of array, 0 means to behave like T instead of array - static constexpr int len = std::max(1,N); // actual number of elements to store - - private: - using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); - - // The number of elements of type atom_t that we break T into for optimal shared-memory access - static constexpr int n_element = sizeof(T) / sizeof(atom_t); - - const int stride; - const unsigned int offset = 0; // dynamic offset in bytes - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_dynamic { - atom_t *operator()(unsigned) - { - static atom_t *cache_; - return cache_; - } - }; - - /** - @brief This is the handle to the shared memory, dynamic specialization - @return Shared memory pointer - */ - template struct cache_dynamic { - __device__ inline atom_t *operator()(unsigned int offset) - { - extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); - } - }; - - __device__ __host__ inline atom_t *cache() const { return target::dispatch(offset); } - - __device__ __host__ inline void save_detail(const T &a, const int k) const - { - atom_t tmp[n_element]; - memcpy(tmp, (void *)&a, sizeof(T)); - int j = target::thread_idx_linear<3>(); -#pragma unroll - for (int i = 0; i < n_element; i++) cache()[(k*n_element + i) * stride + j] = tmp[i]; - } - - __device__ __host__ inline T load_detail(const int k) const - { - atom_t tmp[n_element]; - int j = target::thread_idx_linear<3>(); -#pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[(k*n_element + i) * stride + j]; - T a; - memcpy((void *)&a, tmp, sizeof(T)); - return a; - } - - static constexpr unsigned int get_offset(dim3 block) - { - unsigned int o = 0; - if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } - return o; - } - - public: - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); - } - - /** - @brief Constructor for ThreadLocalCache. - */ - constexpr ThreadLocalCache() : stride(target::block_size<3>()), offset(get_offset(target::block_dim())) { } - - /** - @brief Grab the raw base address to this cache. - */ - __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } - - /** - @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. - @param[in] a The value to store in the thread local cache - */ - __device__ __host__ inline void save(const T &a) const { - static_assert(N == 0); - save_detail(a, 0); - } - - /** - @brief Save the value into an element of the thread local cache. - @param[in] a The value to store in the thread local cache - @param[in] k The index to use - */ - __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } - - /** - @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. 
- @return The value stored in the thread local cache - */ - __device__ __host__ inline T load() const { - static_assert(N == 0); - return load_detail(0); - } - - /** - @brief Load a value from an element of the thread local cache - @param[in] k The index to use - @return The value stored in the thread local cache at that index - */ - __device__ __host__ inline T load(const int k) const { return load_detail(k); } - - /** - @brief Cast operator to allow cache objects to be used where T is expected (when N==0). - */ - __device__ __host__ operator T() const { - static_assert(N == 0); - return load(); - } - - /** - @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected (when N==0). - */ - __device__ __host__ void operator=(const T &src) const { - static_assert(N == 0); - save(src); - } - - /** - @brief Subscripting operator returning value at index for convenience. - @param[in] i The index to use - @return The value stored in the thread local cache at that index - */ - __device__ __host__ T operator[](int i) { return load(i); } - }; - -} // namespace quda - -// include overloads #include "../generic/thread_local_cache.h" diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index f2aa515801..8df61aebc4 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -1,16 +1,145 @@ +#pragma once + +#include +#include #include /** - @file thread_local_cache_helper.h - @brief Convenience overloads to allow ThreadLocalCache objects to - appear in simple expressions. The actual implementation of - ThreadLocalCache is target specific, and located in e.g., - include/targets/cuda/thread_local_cache.h, etc. + @file thread_local_cache.h + + Thread local cache object which may use shared memory for optimization. + The storage can be a single object or an array of objects. */ namespace quda { + template + using atom_t = std::conditional_t>; + + template struct SizeStatic { + static constexpr unsigned int size() { return N; } + }; + + /** + @brief Class for threads to store a unique value, or array of values, which can use + shared memory for optimization purposes. 
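+
+     The offset type O chains allocations so that two caches can share the
+     dynamic shared memory block without overlapping, as in the gauge
+     kernels above (using decltype(Stap) as the offset type here is an
+     illustrative sketch):
+
+       ThreadLocalCache<Link> Stap{};
+       ThreadLocalCache<Link, 0, decltype(Stap)> Rect{};  // placed after Stap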
+ */ + template class ThreadLocalCache : SharedMemory, SizeStatic, O> + { + public: + using value_type = T; + using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one + static constexpr int N = N_; // size of array, 0 means to behave like T instead of array + static constexpr int len = std::max(1,N); // actual number of elements to store + using Smem = SharedMemory, SizeStatic, O>; + + private: + using atom_t = atom_t; + static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); + + // The number of elements of type atom_t that we break T into for optimal shared-memory access + static constexpr int n_element = sizeof(T) / sizeof(atom_t); + + const int stride; + + constexpr Smem smem() const { return *dynamic_cast(this); } + + __device__ __host__ inline void save_detail(const T &a, const int k) const + { + atom_t tmp[n_element]; + memcpy(tmp, (void *)&a, sizeof(T)); + int j = target::thread_idx_linear<3>(); +#pragma unroll + for (int i = 0; i < n_element; i++) smem()[(k*n_element + i) * stride + j] = tmp[i]; + } + + __device__ __host__ inline T load_detail(const int k) const + { + atom_t tmp[n_element]; + int j = target::thread_idx_linear<3>(); +#pragma unroll + for (int i = 0; i < n_element; i++) tmp[i] = smem()[(k*n_element + i) * stride + j]; + T a; + memcpy((void *)&a, tmp, sizeof(T)); + return a; + } + + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; + } + + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); + } + + /** + @brief Constructor for ThreadLocalCache. + */ + constexpr ThreadLocalCache() : stride(target::block_size<3>()) {} + + /** + @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. + @param[in] a The value to store in the thread local cache + */ + __device__ __host__ inline void save(const T &a) const { + static_assert(N == 0); + save_detail(a, 0); + } + + /** + @brief Save the value into an element of the thread local cache. + @param[in] a The value to store in the thread local cache + @param[in] k The index to use + */ + __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } + + /** + @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. + @return The value stored in the thread local cache + */ + __device__ __host__ inline T load() const { + static_assert(N == 0); + return load_detail(0); + } + + /** + @brief Load a value from an element of the thread local cache + @param[in] k The index to use + @return The value stored in the thread local cache at that index + */ + __device__ __host__ inline T load(const int k) const { return load_detail(k); } + + /** + @brief Cast operator to allow cache objects to be used where T is expected (when N==0). + */ + __device__ __host__ operator T() const { + static_assert(N == 0); + return load(); + } + + /** + @brief Assignment operator to allow cache objects to be used on + the lhs where T is otherwise expected (when N==0). + */ + __device__ __host__ void operator=(const T &src) const { + static_assert(N == 0); + save(src); + } + + /** + @brief Subscripting operator returning value at index for convenience. 
+ @param[in] i The index to use + @return The value stored in the thread local cache at that index + */ + __device__ __host__ T operator[](int i) { return load(i); } + }; + template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; From d6ac933e38f1092a98e1c32f12f0c30b1c06fb28 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 11 Jul 2023 13:32:41 -0500 Subject: [PATCH 07/27] add shared memory helper to HIP --- include/targets/hip/shared_memory_helper.h | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 include/targets/hip/shared_memory_helper.h diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h new file mode 100644 index 0000000000..bd8d919359 --- /dev/null +++ b/include/targets/hip/shared_memory_helper.h @@ -0,0 +1,85 @@ +#pragma once + +#include + +/** + @file shared_memory_helper.h + + Target specific helper for allocating and accessing shared memory. + */ + +namespace quda +{ + + /** + @brief Class which is used to allocate and access shared memory. + The shared memory is treated as an array of type T, with the + number of elements given by the static member S::size(). The + offset from the beginning of the total shared memory block is + given by the static member O::shared_mem_size(block), or 0 if O + is void. + */ + template class SharedMemory + { + public: + using value_type = T; + + private: + T *data; + const unsigned int size; // number of elements of type T + + /** + @brief This is a dummy instantiation for the host compiler + */ + template struct cache_dynamic { + T *operator()(unsigned int) + { + static T *cache_; + return cache_; + } + }; + + /** + @brief This is the handle to the dynamic shared memory + @return Shared memory pointer + */ + template struct cache_dynamic { + __device__ inline T *operator()(unsigned int offset) + { + extern __shared__ int cache_[]; + return reinterpret_cast(reinterpret_cast(cache_) + offset); + } + }; + + __device__ __host__ inline T *cache(unsigned int offset) const + { + return target::dispatch(offset); + } + + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; + } + + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + S::size()*sizeof(T); + } + + /** + @brief Constructor for SharedMemory object. + */ + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + + /** + @brief Subscripting operator returning a reference to element. + @param[in] i The index to use. + @return Reference to value stored at that index. 
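+
+     Example of element access (an illustrative sketch; Size64 is a
+     hypothetical provider of the static S::size() contract, and the
+     kernel must be launched with shared_mem_size(block) bytes of dynamic
+     shared memory):
+
+       struct Size64 { static constexpr unsigned int size() { return 64; } };
+       SharedMemory<float, Size64> smem;
+       smem[target::thread_idx_linear<3>() % 64] = 0.0f;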
+ */ + __device__ __host__ T &operator[](const int i) const { return data[i]; } + }; + +} // namespace quda From f29ff0049123b12f9ac74ba7acbfc9e5133c0723 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 8 Aug 2023 18:21:51 -0500 Subject: [PATCH 08/27] refactor SharedMemoryCache to use new SharedMemory object --- include/kernels/color_spinor_pack.cuh | 16 +- include/kernels/dslash_clover_helper.cuh | 2 +- include/kernels/dslash_coarse.cuh | 2 +- include/kernels/dslash_domain_wall_m5.cuh | 10 +- include/kernels/dslash_mobius_eofa.cuh | 4 +- .../kernels/dslash_ndeg_twisted_clover.cuh | 2 +- ...ash_ndeg_twisted_clover_preconditioned.cuh | 2 +- ...slash_ndeg_twisted_mass_preconditioned.cuh | 2 +- include/kernels/hisq_paths_force.cuh | 4 +- .../targets/cuda/shared_memory_cache_helper.h | 294 ------------------ include/targets/cuda/shared_memory_helper.h | 15 +- include/targets/cuda/thread_array.h | 52 +--- include/targets/generic/helpers.h | 42 +++ .../generic/shared_memory_cache_helper.h | 269 ++++++++++++++-- include/targets/generic/thread_array.h | 41 +++ include/targets/generic/thread_local_cache.h | 32 +- 16 files changed, 386 insertions(+), 403 deletions(-) create mode 100644 include/targets/generic/helpers.h create mode 100644 include/targets/generic/thread_array.h diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index 58ae6165d3..3d6ae5b6b2 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ -166,6 +166,13 @@ namespace quda { } }; + struct DimsPadX { + static constexpr dim3 dims(dim3 block) { + block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); + return block; + } + }; + template <> struct site_max { template __device__ inline auto operator()(typename Arg::real thread_max, Arg &) { @@ -173,11 +180,12 @@ namespace quda { constexpr int Ms = spins_per_thread(); constexpr int Mc = colors_per_thread(); constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); - auto block = target::block_dim(); + //auto block = target::block_dim(); // pad the shared block size to avoid bank conflicts - block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); - block.y = color_spin_threads; // state the y block since we know it at compile time - SharedMemoryCache cache(block); + //block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); + //block.y = color_spin_threads; // state the y block since we know it at compile time + //SharedMemoryCache cache(block); + SharedMemoryCache cache; cache.save(thread_max); cache.sync(); real this_site_max = static_cast(0); diff --git a/include/kernels/dslash_clover_helper.cuh b/include/kernels/dslash_clover_helper.cuh index 00b61a9b3e..4cf6f1a311 100644 --- a/include/kernels/dslash_clover_helper.cuh +++ b/include/kernels/dslash_clover_helper.cuh @@ -203,7 +203,7 @@ namespace quda { Mat A = arg.clover(x_cb, clover_parity, chirality); - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; half_fermion in_chi[n_flavor]; // flavor array of chirally projected fermion #pragma unroll diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh index 9f90de7287..cd214d8fbd 100644 --- a/include/kernels/dslash_coarse.cuh +++ b/include/kernels/dslash_coarse.cuh @@ -299,7 +299,7 @@ namespace quda { template <> struct dim_collapse { template __device__ __host__ inline void operator()(T &out, int dir, int dim, const Arg &arg) { - 
SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; // only need to write to shared memory if not master thread if (dim > 0 || dir) cache.save(out); diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index bab21d4c11..dfce18bede 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -220,7 +220,7 @@ namespace quda if (mobius_m5::use_half_vector()) { // if using shared-memory caching then load spinor field for my site into cache typedef ColorSpinor HalfVector; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; { // forwards direction constexpr int proj_dir = dagger ? +1 : -1; @@ -271,7 +271,7 @@ namespace quda } else { // use_half_vector // if using shared-memory caching then load spinor field for my site into cache - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { if (sync) { cache.sync(); } cache.save(in); @@ -377,7 +377,7 @@ namespace quda const auto inv = arg.inv; // if using shared-memory caching then load spinor field for my site into cache - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { // cache.save(arg.in(s_ * arg.volume_4d_cb + x_cb, parity)); if (sync) { cache.sync(); } @@ -436,7 +436,7 @@ namespace quda Vector out; if (mobius_m5::use_half_vector()) { - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; { // first do R constexpr int proj_dir = dagger ? -1 : +1; @@ -495,7 +495,7 @@ namespace quda out += l.reconstruct(4, proj_dir); } } else { // use_half_vector - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { if (sync) { cache.sync(); } cache.save(in); diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh index f5e0a5c8ac..3d62ec3923 100644 --- a/include/kernels/dslash_mobius_eofa.cuh +++ b/include/kernels/dslash_mobius_eofa.cuh @@ -107,7 +107,7 @@ namespace quda using real = typename Arg::real; typedef ColorSpinor Vector; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; Vector out; cache.save(arg.in(s * arg.volume_4d_cb + x_cb, parity)); @@ -185,7 +185,7 @@ namespace quda typedef ColorSpinor Vector; const auto sherman_morrison = arg.sherman_morrison; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; cache.save(arg.in(s * arg.volume_4d_cb + x_cb, parity)); cache.sync(); diff --git a/include/kernels/dslash_ndeg_twisted_clover.cuh b/include/kernels/dslash_ndeg_twisted_clover.cuh index 108f8c5e84..cb8bc61ad7 100644 --- a/include/kernels/dslash_ndeg_twisted_clover.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover.cuh @@ -72,7 +72,7 @@ namespace quda // apply the chiral and flavor twists // use consistent load order across s to ensure better cache locality Vector x = arg.x(my_flavor_idx, my_spinor_parity); - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; cache.save(x); x.toRel(); // switch to chiral basis diff --git a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh index bdbff30817..ebd8f71da6 100644 --- a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh @@ -91,7 +91,7 @@ namespace quda int chirality = flavor; // relabel flavor as chirality - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; enum 
swizzle_direction { FORWARDS = 0, diff --git a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh index 98e72eb61a..8bab3d5623 100644 --- a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh @@ -95,7 +95,7 @@ namespace quda } if (!dagger || Arg::asymmetric) { // apply A^{-1} to D*in - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (isComplete(arg, coord) && active) { // to apply the preconditioner we need to put "out" in shared memory so the other flavor can access it cache.save(out); diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index bdba9d6bbe..84173a9a5a 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -559,7 +559,7 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - ThreadLocalCache Uab_cache{}; + ThreadLocalCache Uab_cache; // Scoped load of Uab { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -961,7 +961,7 @@ namespace quda { // calculate p5_sig constexpr int cacheLen = sig_positive ? 3 : 2; - ThreadLocalCache Matrix_cache{}; + ThreadLocalCache Matrix_cache; if constexpr (sig_positive) { Link force_sig = arg.force(arg.sig, point_a, parity_a); diff --git a/include/targets/cuda/shared_memory_cache_helper.h b/include/targets/cuda/shared_memory_cache_helper.h index 7c7c0a1b28..73be0cd01b 100644 --- a/include/targets/cuda/shared_memory_cache_helper.h +++ b/include/targets/cuda/shared_memory_cache_helper.h @@ -1,295 +1 @@ -#pragma once - -#include -#include - -/** - @file shared_memory_cache_helper.h - - Helper functionality for aiding the use of the shared memory for - sharing data between threads in a thread block. - */ - -namespace quda -{ - - /** - @brief Class which wraps around a shared memory cache for type T, - where each thread in the thread block stores a unique value in - the cache which any other thread can access. - - This accessor supports both explicit run-time block size and - compile-time sizing. - - * For run-time block size, the constructor should be initialied - with the desired block size. - - * For compile-time block size, no arguments should be passed to - the constructor, and then the second and third template - parameters correspond to the y and z dimensions of the block, - respectively. The x dimension of the block will be set - according the maximum number of threads possible, given these - dimensions. 
- */ - template class SharedMemoryCache - { - public: - using value_type = T; - static constexpr int block_size_y = block_size_y_; - static constexpr int block_size_z = block_size_z_; - static constexpr bool dynamic = dynamic_; - - private: - /** maximum number of threads in x given the y and z block sizes */ - static constexpr int block_size_x = device::max_block_size(); - - using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); - - // The number of elements of type atom_t that we break T into for optimal shared-memory access - static constexpr int n_element = sizeof(T) / sizeof(atom_t); - - const dim3 block; - const int stride; - const unsigned int offset = 0; // dynamic offset in bytes - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_dynamic { - atom_t *operator()(unsigned) - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, dynamic specialization - @return Shared memory pointer - */ - template struct cache_dynamic { - __device__ inline atom_t *operator()(unsigned int offset) - { - extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); - } - }; - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_static { - atom_t *operator()() - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, static specialization - @return Shared memory pointer - */ - template struct cache_static { - __device__ inline atom_t *operator()() - { - static __shared__ atom_t cache_[n_element * block_size_x * block_size_y * block_size_z]; - return reinterpret_cast(cache_); - } - }; - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(offset); - } - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(); - } - - __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const - { - atom_t tmp[n_element]; - memcpy(tmp, (void *)&a, sizeof(T)); - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; - } - - __device__ __host__ inline T load_detail(int x, int y, int z) const - { - atom_t tmp[n_element]; - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; - T a; - memcpy((void *)&a, tmp, sizeof(T)); - return a; - } - - /** - @brief Dummy instantiation for the host compiler - */ - template struct sync_impl { - void operator()() { } - }; - - /** - @brief Synchronize the cache when on the device - */ - template struct sync_impl { - __device__ inline void operator()() { __syncthreads(); } - }; - - public: - /** - @brief constructor for SharedMemory cache. If no arguments are - pass, then the dimensions are set according to the templates - block_size_y and block_size_z, together with the derived - block_size_x. Otherwise use the block sizes passed into the - constructor. - - @param[in] block Block dimensions for the 3-d shared memory object - @param[in] thread_offset "Perceived" offset from dynamic shared - memory base pointer (used when we have multiple caches in - scope). Need to include block size to actual offset. 
- */ - constexpr SharedMemoryCache(dim3 block = dim3(block_size_x, block_size_y, block_size_z), - unsigned int thread_offset = 0) : - block(block), stride(block.x * block.y * block.z), offset(stride * thread_offset) - { - } - - /** - @brief Grab the raw base address to shared memory. - */ - __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - */ - __device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - save_detail(a, x, y, z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - */ - __device__ __host__ inline void save_x(const T &a, int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - save_detail(a, x, tid.y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] y The y index to use - */ - __device__ __host__ inline void save_y(const T &a, int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - save_detail(a, tid.x, y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] z The z index to use - */ - __device__ __host__ inline void save_z(const T &a, int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? tid.z : z; - save_detail(a, tid.x, tid.y, z); - } - - /** - @brief Load a value from the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - return load_detail(x, y, z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] x The x index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_x(int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - return load_detail(x, tid.y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] y The y index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_y(int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - return load_detail(tid.x, y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_z(int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? 
tid.z : z; - return load_detail(tid.x, tid.y, z); - } - - /** - @brief Synchronize the cache - */ - __device__ __host__ void sync() const { target::dispatch(); } - - /** - @brief Cast operator to allow cache objects to be used where T - is expected - */ - __device__ __host__ operator T() const { return load(); } - - /** - @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected. - */ - __device__ __host__ void operator=(const T &src) const { save(src); } - }; - -} // namespace quda - -// include overloads #include "../generic/shared_memory_cache_helper.h" diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index bd8d919359..21a61cca45 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -14,10 +14,10 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the static member S::size(). The - offset from the beginning of the total shared memory block is - given by the static member O::shared_mem_size(block), or 0 if O - is void. + number of elements given by the call to the static member + S::size(target::block_dim()). The offset from the beginning of + the total shared memory block is given by the static member + O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory { @@ -56,6 +56,7 @@ namespace quda return target::dispatch(offset); } + public: static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -63,16 +64,16 @@ namespace quda return o; } - public: static constexpr unsigned int shared_mem_size(dim3 block) { - return get_offset(block) + S::size()*sizeof(T); + return get_offset(block) + S::size(block)*sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), + size(S::size(target::block_dim())) {} /** @brief Subscripting operator returning a reference to element. diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index 4fe1bb33f6..cf778afbd5 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -1,49 +1,15 @@ #pragma once -#include "shared_memory_cache_helper.h" +//#ifndef _NVHPC_CUDA -namespace quda -{ - -#ifndef _NVHPC_CUDA - - /** - @brief Class that provides indexable per-thread storage. On CUDA - this maps to using assigning each thread a unique window of - shared memory using the SharedMemoryCache object. - */ - template struct thread_array { - SharedMemoryCache, 1, 1, false> device_array; - int offset; - array host_array; - array &array_; - - __device__ __host__ constexpr thread_array() : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array(); // call default constructor - } +//#include "../generic/thread_array.h" - template - __device__ __host__ constexpr thread_array(T first, const Ts... other) : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? 
*(device_array.data() + offset) : host_array)
-    {
-      array_ = array {first, other...};
-    }
-
-    __device__ __host__ T &operator[](int i) { return array_[i]; }
-    __device__ __host__ const T &operator[](int i) const { return array_[i]; }
-  };
-
-#else
-
-  template struct thread_array : array {
-  };
-
-#endif
+#include 
 namespace quda
 {
+  template struct thread_array : array {};
+}
 
-} // namespace quda
+//#endif
diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h
new file mode 100644
index 0000000000..fe90c0e5ed
--- /dev/null
+++ b/include/targets/generic/helpers.h
@@ -0,0 +1,42 @@
+#pragma once
+
+namespace quda
+{
+
+  template
+  using atom_t = std::conditional_t>;
+
+  template struct SizeStatic {
+    static constexpr unsigned int size(dim3 block) {
+      return N;
+    }
+  };
+
+  template struct SizePerThread {
+    static constexpr unsigned int size(dim3 block) {
+      return N * block.x * block.y * block.z;
+    }
+  };
+
+  template struct SizeDims {
+    static constexpr unsigned int size(dim3 block) {
+      dim3 dims = D::dims(block);
+      return dims.x * dims.y * dims.z * N;
+    }
+  };
+
+  struct DimsBlock {
+    static constexpr dim3 dims(dim3 block) {
+      return block;
+    }
+  };
+
+  /**
+     @brief Uniform helper for exposing type T, whether we are dealing
+     with an instance of T or some wrapper of T
+   */
+  template struct get_type {
+    using type = T;
+  };
+
+}
diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h
index 060e3478f8..ddf831e6a4 100644
--- a/include/targets/generic/shared_memory_cache_helper.h
+++ b/include/targets/generic/shared_memory_cache_helper.h
@@ -1,3 +1,16 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+/**
+   @file shared_memory_cache_helper.h
+
+   Helper functionality for aiding the use of the shared memory for
+   sharing data between threads in a thread block.
+ */
+
 /**
    @file shared_memory_cache_helper.h
    @brief Convenience overloads to allow SharedMemoryCache objects to
@@ -9,46 +22,269 @@
 namespace quda
 {
 
-  template
-  __device__ __host__ inline T operator+(const SharedMemoryCache &a, const T &b)
+  /**
+     @brief Class which wraps around a shared memory cache for type T,
+     where each thread in the thread block stores a unique value in
+     the cache which any other thread can access.
+
+     The cache dimensions are derived from the kernel block dimensions
+     via the dims_type policy D, and the offset_type O names any other
+     shared-memory consumer allocated before this one, so that the two
+     allocations do not overlap.
+   */
+  template
+  class SharedMemoryCache : SharedMemory, SizeDims)>, O>
+  {
+  public:
+    using value_type = T;
+    using dims_type = D;
+    using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
+    using Smem = SharedMemory, SizeDims)>, O>;
+
+  private:
+    using atom_t = atom_t;
+    static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");
+
+    // The number of elements of type atom_t that we break T into for optimal shared-memory access
+    static constexpr int n_element = sizeof(T) / sizeof(atom_t);
+
+    const dim3 block;
+    const int stride;
+
+    constexpr Smem smem() const { return *dynamic_cast(this); }
+
+    __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
+    {
+      atom_t tmp[n_element];
+      memcpy(tmp, (void *)&a, sizeof(T));
+      int j = (z * block.y + y) * block.x + x;
+#pragma unroll
+      for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i];
+    }
+
+    __device__ __host__ inline T load_detail(int x, int y, int z) const
+    {
+      atom_t tmp[n_element];
+      int j = (z * block.y + y) * block.x + x;
+#pragma unroll
+      for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j];
+      T a;
+      memcpy((void *)&a, tmp, sizeof(T));
+      return a;
+    }
+
+    /**
+       @brief Dummy instantiation for the host compiler
+    */
+    template struct sync_impl {
+      void operator()() { }
+    };
+
+    /**
+       @brief Synchronize the cache when on the device
+    */
+    template struct sync_impl {
+      __device__ inline void operator()() { __syncthreads(); }
+    };
+
+  public:
+    using Smem::shared_mem_size;
+
+    /**
+       @brief constructor for SharedMemory cache. The cache dimensions
+       are derived from the kernel block dimensions via the dims_type
+       policy D, and the allocation is offset past any preceding
+       offset_type allocation.
+    */
+    constexpr SharedMemoryCache() :
+      block(D::dims(target::block_dim())), stride(block.x * block.y * block.z)
+    {
+      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims::size(dim3{8,8,8})*sizeof(T));
+    }
+
+    /**
+       @brief Grab the raw base address to shared memory.
+    */
+    __device__ __host__ inline auto data() const {
+      return reinterpret_cast(&smem()[0]);
+    }
+
+    /**
+       @brief Save the value into the 3-d shared memory cache.
+       @param[in] a The value to store in the shared memory cache
+       @param[in] x The x index to use
+       @param[in] y The y index to use
+       @param[in] z The z index to use
+    */
+    __device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const
+    {
+      auto tid = target::thread_idx();
+      x = (x == -1) ? tid.x : x;
+      y = (y == -1) ? tid.y : y;
+      z = (z == -1) ? tid.z : z;
+      save_detail(a, x, y, z);
+    }
+
+    /**
+       @brief Save the value into the 3-d shared memory cache.
+ @param[in] a The value to store in the shared memory cache + @param[in] y The y index to use + */ + __device__ __host__ inline void save_y(const T &a, int y = -1) const + { + auto tid = target::thread_idx(); + y = (y == -1) ? tid.y : y; + save_detail(a, tid.x, y, tid.z); + } + + /** + @brief Save the value into the 3-d shared memory cache. + @param[in] a The value to store in the shared memory cache + @param[in] z The z index to use + */ + __device__ __host__ inline void save_z(const T &a, int z = -1) const + { + auto tid = target::thread_idx(); + z = (z == -1) ? tid.z : z; + save_detail(a, tid.x, tid.y, z); + } + + /** + @brief Load a value from the shared memory cache + @param[in] x The x index to use + @param[in] y The y index to use + @param[in] z The z index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const + { + auto tid = target::thread_idx(); + x = (x == -1) ? tid.x : x; + y = (y == -1) ? tid.y : y; + z = (z == -1) ? tid.z : z; + return load_detail(x, y, z); + } + + /** + @brief Load a vector from the shared memory cache + @param[in] x The x index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load_x(int x = -1) const + { + auto tid = target::thread_idx(); + x = (x == -1) ? tid.x : x; + return load_detail(x, tid.y, tid.z); + } + + /** + @brief Load a vector from the shared memory cache + @param[in] y The y index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load_y(int y = -1) const + { + auto tid = target::thread_idx(); + y = (y == -1) ? tid.y : y; + return load_detail(tid.x, y, tid.z); + } + + /** + @brief Load a vector from the shared memory cache + @param[in] z The z index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load_z(int z = -1) const + { + auto tid = target::thread_idx(); + z = (z == -1) ? tid.z : z; + return load_detail(tid.x, tid.y, z); + } + + /** + @brief Synchronize the cache + */ + __device__ __host__ void sync() const { target::dispatch(); } + + /** + @brief Cast operator to allow cache objects to be used where T + is expected + */ + __device__ __host__ operator T() const { return load(); } + + /** + @brief Assignment operator to allow cache objects to be used on + the lhs where T is otherwise expected. 
+   */
+    __device__ __host__ void operator=(const T &src) const { save(src); }
+  };
+
+  template
+  __device__ __host__ inline T operator+(const SharedMemoryCache &a, const T &b)
   {
     return static_cast(a) + b;
   }
 
-  template
-  __device__ __host__ inline T operator+(const T &a, const SharedMemoryCache &b)
+  template
+  __device__ __host__ inline T operator+(const T &a, const SharedMemoryCache &b)
   {
     return a + static_cast(b);
   }
 
-  template
-  __device__ __host__ inline T operator-(const SharedMemoryCache &a, const T &b)
+  template
+  __device__ __host__ inline T operator-(const SharedMemoryCache &a, const T &b)
   {
     return static_cast(a) - b;
   }
 
-  template
-  __device__ __host__ inline T operator-(const T &a, const SharedMemoryCache &b)
+  template
+  __device__ __host__ inline T operator-(const T &a, const SharedMemoryCache &b)
   {
     return a - static_cast(b);
   }
 
-  template
-  __device__ __host__ inline auto operator+=(SharedMemoryCache &a, const T &b)
+  template
+  __device__ __host__ inline auto operator+=(SharedMemoryCache &a, const T &b)
   {
     a.save(static_cast(a) + b);
     return a;
   }
 
-  template
-  __device__ __host__ inline auto operator-=(SharedMemoryCache &a, const T &b)
+  template
+  __device__ __host__ inline auto operator-=(SharedMemoryCache &a, const T &b)
   {
     a.save(static_cast(a) - b);
     return a;
   }
 
-  template
-  __device__ __host__ inline auto conj(const SharedMemoryCache &a)
+  template
+  __device__ __host__ inline auto conj(const SharedMemoryCache &a)
   {
     return conj(static_cast(a));
   }
 
@@ -57,12 +293,9 @@ namespace quda
      @brief Uniform helper for exposing type T, whether we are dealing
      with an instance of T or SharedMemoryCache
    */
-  template struct get_type {
-    using type = T;
-  };
   template struct get_type<
-    T, std::enable_if_t>>> {
+    T, std::enable_if_t>>> {
     using type = typename T::value_type;
   };
 
diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h
new file mode 100644
index 0000000000..3bbb739964
--- /dev/null
+++ b/include/targets/generic/thread_array.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace quda
+{
+
+  /**
+     @brief Class that provides indexable per-thread storage. On CUDA
+     this maps to assigning each thread a unique window of shared
+     memory via the underlying SharedMemory object.
+   */
+  template
+  class thread_array : SharedMemory, SizePerThread<1>, O>
+  {
+    int offset;
+    using Smem = SharedMemory, SizePerThread<1>, O>;
+    constexpr Smem smem() const { return *dynamic_cast(this); }
+    array &data() const { return smem()[offset]; }
+
+    public:
+    __device__ __host__ constexpr thread_array()
+    {
+      offset = target::thread_idx_linear<3>();
+      data() = array(); // call default constructor
+    }
+
+    template
+    __device__ __host__ constexpr thread_array(T first, const Ts...
other) + { + offset = target::thread_idx_linear<3>(); + data() = array {first, other...}; + } + + __device__ __host__ T &operator[](int i) { return data()[i]; } + __device__ __host__ const T &operator[](int i) const { return data()[i]; } + }; + +} // namespace quda diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 8df61aebc4..3b14564bec 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include /** @file thread_local_cache.h @@ -14,25 +14,19 @@ namespace quda { - template - using atom_t = std::conditional_t>; - - template struct SizeStatic { - static constexpr unsigned int size() { return N; } - }; - /** @brief Class for threads to store a unique value, or array of values, which can use shared memory for optimization purposes. */ - template class ThreadLocalCache : SharedMemory, SizeStatic, O> + template class ThreadLocalCache : + SharedMemory, SizePerThread)>, O> { public: using value_type = T; - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int N = N_; // size of array, 0 means to behave like T instead of array + using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int len = std::max(1,N); // actual number of elements to store - using Smem = SharedMemory, SizeStatic, O>; + using Smem = SharedMemory, SizePerThread)>, O>; private: using atom_t = atom_t; @@ -65,23 +59,15 @@ namespace quda return a; } - static constexpr unsigned int get_offset(dim3 block) - { - unsigned int o = 0; - if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } - return o; - } - public: - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); - } + using Smem::shared_mem_size; /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : stride(target::block_size<3>()) {} + constexpr ThreadLocalCache() : stride(target::block_size<3>()) { + static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizePerThread::size(dim3{8,8,8})*sizeof(T)); + } /** @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. 
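
Taken together, the refactor in this patch leaves every shared-memory consumer
(SharedMemoryCache, ThreadLocalCache, thread_array) computing its footprint
from a size policy and chaining its starting offset through the offset_type
parameter O. The standalone sketch below models only that offset arithmetic on
the host: dim3, Link, SizePerThread and the *Model/*Cache names here are
simplified stand-ins rather than the QUDA definitions, but the
get_offset/shared_mem_size recursion mirrors the one in shared_memory_helper.h
and is what lets multiple caches, like the ThreadLocalCache uses in
hisq_paths_force.cuh above, coexist in one dynamic shared-memory block.

    #include <cstdio>
    #include <type_traits>

    struct dim3 { unsigned int x, y, z; }; // host stand-in for the CUDA builtin

    // one element of the wrapped type per thread, as in targets/generic/helpers.h
    template <int N = 1> struct SizePerThread {
      static constexpr unsigned int size(dim3 b) { return N * b.x * b.y * b.z; }
    };

    // model of SharedMemory<T, S, O>: the offset is the previous consumer's
    // total footprint, so successive consumers never overlap
    template <typename T, typename S, typename O = void> struct SharedMemoryModel {
      static constexpr unsigned int get_offset(dim3 b)
      {
        if constexpr (!std::is_same_v<O, void>) return O::shared_mem_size(b);
        return 0;
      }
      static constexpr unsigned int shared_mem_size(dim3 b)
      {
        return get_offset(b) + S::size(b) * sizeof(T);
      }
    };

    struct Link { double v[18]; }; // stand-in for a 3x3 complex matrix

    using FirstCache = SharedMemoryModel<Link, SizePerThread<1>>;              // starts at byte 0
    using SecondCache = SharedMemoryModel<Link, SizePerThread<1>, FirstCache>; // starts where FirstCache ends

    int main()
    {
      dim3 block {128, 1, 1};
      std::printf("first ends at byte %u\n", FirstCache::shared_mem_size(block));   // 18432
      std::printf("second ends at byte %u\n", SecondCache::shared_mem_size(block)); // 36864
    }

With a 128-thread block each Link-per-thread window is 128 * 144 = 18432
bytes, so the second cache begins exactly where the first ends and the two
allocations never alias.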
From f2419e14a1b0185690582cac62f5154971d0e469 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 8 Aug 2023 23:30:34 -0500 Subject: [PATCH 09/27] remove unused parameter --- include/targets/cuda/thread_array.h | 42 ++++++++++++++++++++++++++--- include/targets/generic/helpers.h | 2 +- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index cf778afbd5..c0c7b36929 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -1,10 +1,46 @@ #pragma once -//#ifndef _NVHPC_CUDA +#ifndef _NVHPC_CUDA //#include "../generic/thread_array.h" +#include +//#include +#include +#include + +namespace quda +{ + template + struct thread_array : SharedMemory, SizePerThread<1>, O> { + int offset; + array host_array; + array &array_; + using Smem = SharedMemory, SizePerThread<1>, O>; + constexpr Smem smem() const { return *dynamic_cast(this); } + + __device__ __host__ constexpr thread_array() : + offset(target::thread_idx_linear<3>()), + array_(*(&smem()[0] + offset)) + //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) + { + array_ = array(); // call default constructor + } + + template + __device__ __host__ constexpr thread_array(T first, const Ts... other) : + offset(target::thread_idx_linear<3>()), + array_(*(&smem()[0] + offset)) + //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) + { + array_ = array {first, other...}; + } + + __device__ __host__ T &operator[](int i) { return array_[i]; } + __device__ __host__ const T &operator[](int i) const { return array_[i]; } + }; +} -//#else +#else #include namespace quda @@ -12,4 +48,4 @@ namespace quda template struct thread_array : array {}; } -//#endif +#endif diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index fe90c0e5ed..4c35347e60 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -7,7 +7,7 @@ namespace quda using atom_t = std::conditional_t>; template struct SizeStatic { - static constexpr unsigned int size(dim3 block) { + static constexpr unsigned int size(dim3) { return N; } }; From dd7b975844874c34bf0a41988ceb3d401e4856c9 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 11 Aug 2023 21:08:16 -0500 Subject: [PATCH 10/27] update thread_array --- include/targets/cuda/thread_array.h | 38 +------------------------- include/targets/generic/thread_array.h | 19 ++++++------- 2 files changed, 10 insertions(+), 47 deletions(-) diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index c0c7b36929..f5e965576d 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -2,43 +2,7 @@ #ifndef _NVHPC_CUDA -//#include "../generic/thread_array.h" -#include -//#include -#include -#include - -namespace quda -{ - template - struct thread_array : SharedMemory, SizePerThread<1>, O> { - int offset; - array host_array; - array &array_; - using Smem = SharedMemory, SizePerThread<1>, O>; - constexpr Smem smem() const { return *dynamic_cast(this); } - - __device__ __host__ constexpr thread_array() : - offset(target::thread_idx_linear<3>()), - array_(*(&smem()[0] + offset)) - //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) - { - array_ = array(); // call default constructor - } - - template - __device__ __host__ constexpr thread_array(T first, const Ts... 
other) : - offset(target::thread_idx_linear<3>()), - array_(*(&smem()[0] + offset)) - //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) - { - array_ = array {first, other...}; - } - - __device__ __host__ T &operator[](int i) { return array_[i]; } - __device__ __host__ const T &operator[](int i) const { return array_[i]; } - }; -} +#include "../generic/thread_array.h" #else diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 3bbb739964..5bbb243391 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -15,27 +15,26 @@ namespace quda template class thread_array : SharedMemory, SizePerThread<1>, O> { - int offset; using Smem = SharedMemory, SizePerThread<1>, O>; constexpr Smem smem() const { return *dynamic_cast(this); } - array &data() const { return smem()[offset]; } + array &array_; public: - __device__ __host__ constexpr thread_array() + __device__ __host__ constexpr thread_array() : + array_(smem()[target::thread_idx_linear<3>()]) { - offset = target::thread_idx_linear<3>(); - data() = array(); // call default constructor + array_ = array(); // call default constructor } template - __device__ __host__ constexpr thread_array(T first, const Ts... other) + __device__ __host__ constexpr thread_array(T first, const Ts... other) : + array_(smem()[target::thread_idx_linear<3>()]) { - offset = target::thread_idx_linear<3>(); - data() = array {first, other...}; + array_ = array {first, other...}; } - __device__ __host__ T &operator[](int i) { return data()[i]; } - __device__ __host__ const T &operator[](int i) const { return data()[i]; } + __device__ __host__ T &operator[](int i) { return array_[i]; } + __device__ __host__ const T &operator[](int i) const { return array_[i]; } }; } // namespace quda From 72cc52b9df373b08e008c769757f8cfccc3d8f83 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 16 Aug 2023 10:10:47 -0500 Subject: [PATCH 11/27] update remaining shared memory uses --- include/kernels/block_transpose.cuh | 11 ++++++- include/kernels/coarse_op_kernel.cuh | 18 +++++++++--- include/kernels/color_spinor_pack.cuh | 29 +++++++++++-------- include/targets/generic/helpers.h | 7 +++++ .../generic/shared_memory_cache_helper.h | 21 ++++++++++---- 5 files changed, 63 insertions(+), 23 deletions(-) diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh index 153c2c9695..2b41f5c520 100644 --- a/include/kernels/block_transpose.cuh +++ b/include/kernels/block_transpose.cuh @@ -47,6 +47,14 @@ namespace quda constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { } static constexpr const char *filename() { return KERNEL_FILE; } + struct Dims { + static constexpr dim3 dims(dim3 block) { + block.x += 1; + block.z = 1; + return block; + } + }; + /** @brief Transpose between the two different orders of batched colorspinor fields: - B: nVec -> spatial/N -> spin/color -> N, where N is for that in floatN @@ -60,7 +68,8 @@ namespace quda int parity = parity_color / Arg::nColor; using color_spinor_t = ColorSpinor; - SharedMemoryCache cache({target::block_dim().x + 1, target::block_dim().y, 1}); + //SharedMemoryCache cache({target::block_dim().x + 1, target::block_dim().y, 1}); + SharedMemoryCache cache; int x_offset = target::block_dim().x * target::block_idx().x; int v_offset = target::block_dim().y * target::block_idx().y; diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh index b63bf7f435..be411eb68c 100644 --- 
a/include/kernels/coarse_op_kernel.cuh +++ b/include/kernels/coarse_op_kernel.cuh @@ -10,6 +10,7 @@ #include #include #include +#include namespace quda { @@ -1387,14 +1388,21 @@ namespace quda { }; template <> struct storeCoarseSharedAtomic_impl { + template using CacheT = + complex[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + template using Cache = SharedMemoryCache,DimsStatic<2,1,1>>; + template inline __device__ void operator()(VUV &vuv, bool isDiagonal, int coarse_x_cb, int coarse_parity, int i0, int j0, int parity, const Pack &pack, const Arg &arg) { using real = typename Arg::Float; using TileType = typename Arg::vuvTileType; const int dim_index = arg.dim_index % arg.Y_atomic.geometry; - __shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; - __shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + //__shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + //__shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + Cache cache; + auto &X = cache.data()[0]; + auto &Y = cache.data()[1]; int x_ = coarse_x_cb % arg.aggregates_per_block; int tx = virtualThreadIdx(arg); @@ -1416,7 +1424,8 @@ namespace quda { } } - __syncthreads(); + //__syncthreads(); + cache.sync(); #pragma unroll for (int i = 0; i < TileType::M; i++) { @@ -1445,7 +1454,8 @@ namespace quda { } } - __syncthreads(); + //__syncthreads(); + cache.sync(); if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) { diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index 0f4c2b4466..2cc489181a 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ -171,27 +171,32 @@ namespace quda { } }; - template - struct DimsPadX { - static constexpr dim3 dims(dim3 block) { - if (is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); - return block; - } - }; - template <> struct site_max { + template + struct DimsPadX { + static constexpr int Ms = spins_per_thread(Arg::nSpin); + static constexpr int Mc = colors_per_thread(Arg::nColor); + static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); + static constexpr dim3 dims(dim3 block) { + if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); + block.y = color_spin_threads; // state the y block since we know it at compile time + return block; + } + }; + template __device__ inline auto operator()(typename Arg::real thread_max, Arg &) { using real = typename Arg::real; - constexpr int Ms = spins_per_thread(Arg::nSpin); - constexpr int Mc = colors_per_thread(Arg::nColor); - constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); + //constexpr int Ms = spins_per_thread(Arg::nSpin); + //constexpr int Mc = colors_per_thread(Arg::nColor); + //constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); + constexpr int color_spin_threads = DimsPadX::color_spin_threads; //auto block = target::block_dim(); // pad the shared block size to avoid bank conflicts for native ordering //if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); //block.y = 
color_spin_threads; // state the y block since we know it at compile time //SharedMemoryCache cache(block); - SharedMemoryCache> cache; + SharedMemoryCache> cache; cache.save(thread_max); cache.sync(); real this_site_max = static_cast(0); diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index 4c35347e60..2ab4efbda4 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -31,6 +31,13 @@ namespace quda } }; + template + struct DimsStatic { + static constexpr dim3 dims(dim3 block) { + return dim3(x,y,z); + } + }; + /** @brief Uniform helper for exposing type T, whether we are dealing with an instance of T or some wrapper of T diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index ddf831e6a4..f9b7eccd77 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -56,6 +56,9 @@ namespace quda // The number of elements of type atom_t that we break T into for optimal shared-memory access static constexpr int n_element = sizeof(T) / sizeof(atom_t); + // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array) + template using maybeT = std::conditional_t,T,void>; + const dim3 block; const int stride; @@ -70,7 +73,8 @@ namespace quda for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i]; } - __device__ __host__ inline T load_detail(int x, int y, int z) const + template + __device__ __host__ inline maybeT load_detail(int x, int y, int z) const { atom_t tmp[n_element]; int j = (z * block.y + y) * block.x + x; @@ -182,7 +186,8 @@ namespace quda @param[in] z The z index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const + template + __device__ __host__ inline maybeT load(int x = -1, int y = -1, int z = -1) const { auto tid = target::thread_idx(); x = (x == -1) ? tid.x : x; @@ -196,7 +201,8 @@ namespace quda @param[in] x The x index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load_x(int x = -1) const + template + __device__ __host__ inline maybeT load_x(int x = -1) const { auto tid = target::thread_idx(); x = (x == -1) ? tid.x : x; @@ -208,7 +214,8 @@ namespace quda @param[in] y The y index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load_y(int y = -1) const + template + __device__ __host__ inline maybeT load_y(int y = -1) const { auto tid = target::thread_idx(); y = (y == -1) ? tid.y : y; @@ -220,7 +227,8 @@ namespace quda @param[in] z The z index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load_z(int z = -1) const + template + __device__ __host__ inline maybeT load_z(int z = -1) const { auto tid = target::thread_idx(); z = (z == -1) ? 
tid.z : z; @@ -236,7 +244,8 @@ namespace quda @brief Cast operator to allow cache objects to be used where T is expected */ - __device__ __host__ operator T() const { return load(); } + template + __device__ __host__ operator maybeT() const { return load(); } /** @brief Assignment operator to allow cache objects to be used on From 13f6701af4c7d27108ecfa6224923f49ad94691b Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 16 Aug 2023 12:21:07 -0500 Subject: [PATCH 12/27] fix HIP target --- .../targets/hip/shared_memory_cache_helper.h | 294 ------------------ include/targets/hip/thread_array.h | 41 +-- 2 files changed, 1 insertion(+), 334 deletions(-) diff --git a/include/targets/hip/shared_memory_cache_helper.h b/include/targets/hip/shared_memory_cache_helper.h index 7c7c0a1b28..73be0cd01b 100644 --- a/include/targets/hip/shared_memory_cache_helper.h +++ b/include/targets/hip/shared_memory_cache_helper.h @@ -1,295 +1 @@ -#pragma once - -#include -#include - -/** - @file shared_memory_cache_helper.h - - Helper functionality for aiding the use of the shared memory for - sharing data between threads in a thread block. - */ - -namespace quda -{ - - /** - @brief Class which wraps around a shared memory cache for type T, - where each thread in the thread block stores a unique value in - the cache which any other thread can access. - - This accessor supports both explicit run-time block size and - compile-time sizing. - - * For run-time block size, the constructor should be initialied - with the desired block size. - - * For compile-time block size, no arguments should be passed to - the constructor, and then the second and third template - parameters correspond to the y and z dimensions of the block, - respectively. The x dimension of the block will be set - according the maximum number of threads possible, given these - dimensions. 
- */ - template class SharedMemoryCache - { - public: - using value_type = T; - static constexpr int block_size_y = block_size_y_; - static constexpr int block_size_z = block_size_z_; - static constexpr bool dynamic = dynamic_; - - private: - /** maximum number of threads in x given the y and z block sizes */ - static constexpr int block_size_x = device::max_block_size(); - - using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); - - // The number of elements of type atom_t that we break T into for optimal shared-memory access - static constexpr int n_element = sizeof(T) / sizeof(atom_t); - - const dim3 block; - const int stride; - const unsigned int offset = 0; // dynamic offset in bytes - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_dynamic { - atom_t *operator()(unsigned) - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, dynamic specialization - @return Shared memory pointer - */ - template struct cache_dynamic { - __device__ inline atom_t *operator()(unsigned int offset) - { - extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); - } - }; - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_static { - atom_t *operator()() - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, static specialization - @return Shared memory pointer - */ - template struct cache_static { - __device__ inline atom_t *operator()() - { - static __shared__ atom_t cache_[n_element * block_size_x * block_size_y * block_size_z]; - return reinterpret_cast(cache_); - } - }; - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(offset); - } - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(); - } - - __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const - { - atom_t tmp[n_element]; - memcpy(tmp, (void *)&a, sizeof(T)); - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; - } - - __device__ __host__ inline T load_detail(int x, int y, int z) const - { - atom_t tmp[n_element]; - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; - T a; - memcpy((void *)&a, tmp, sizeof(T)); - return a; - } - - /** - @brief Dummy instantiation for the host compiler - */ - template struct sync_impl { - void operator()() { } - }; - - /** - @brief Synchronize the cache when on the device - */ - template struct sync_impl { - __device__ inline void operator()() { __syncthreads(); } - }; - - public: - /** - @brief constructor for SharedMemory cache. If no arguments are - pass, then the dimensions are set according to the templates - block_size_y and block_size_z, together with the derived - block_size_x. Otherwise use the block sizes passed into the - constructor. - - @param[in] block Block dimensions for the 3-d shared memory object - @param[in] thread_offset "Perceived" offset from dynamic shared - memory base pointer (used when we have multiple caches in - scope). Need to include block size to actual offset. 
- */ - constexpr SharedMemoryCache(dim3 block = dim3(block_size_x, block_size_y, block_size_z), - unsigned int thread_offset = 0) : - block(block), stride(block.x * block.y * block.z), offset(stride * thread_offset) - { - } - - /** - @brief Grab the raw base address to shared memory. - */ - __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - */ - __device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - save_detail(a, x, y, z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - */ - __device__ __host__ inline void save_x(const T &a, int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - save_detail(a, x, tid.y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] y The y index to use - */ - __device__ __host__ inline void save_y(const T &a, int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - save_detail(a, tid.x, y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] z The z index to use - */ - __device__ __host__ inline void save_z(const T &a, int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? tid.z : z; - save_detail(a, tid.x, tid.y, z); - } - - /** - @brief Load a value from the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - return load_detail(x, y, z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] x The x index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_x(int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - return load_detail(x, tid.y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] y The y index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_y(int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - return load_detail(tid.x, y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_z(int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? 
tid.z : z; - return load_detail(tid.x, tid.y, z); - } - - /** - @brief Synchronize the cache - */ - __device__ __host__ void sync() const { target::dispatch(); } - - /** - @brief Cast operator to allow cache objects to be used where T - is expected - */ - __device__ __host__ operator T() const { return load(); } - - /** - @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected. - */ - __device__ __host__ void operator=(const T &src) const { save(src); } - }; - -} // namespace quda - -// include overloads #include "../generic/shared_memory_cache_helper.h" diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h index 77751adf1b..24f986cc26 100644 --- a/include/targets/hip/thread_array.h +++ b/include/targets/hip/thread_array.h @@ -1,40 +1 @@ -#pragma once - -#include "shared_memory_cache_helper.h" - -namespace quda -{ - - /** - @brief Class that provides indexable per-thread storage. On HIP - this maps to using assigning each thread a unique window of - shared memory using the SharedMemoryCache object. - */ - template struct thread_array { - SharedMemoryCache, 1, 1, false> device_array; - int offset; - array host_array; - array &array_; - - __device__ __host__ constexpr thread_array() : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array(); // call default constructor - } - - template - __device__ __host__ constexpr thread_array(T first, const Ts... other) : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array {first, other...}; - } - - __device__ __host__ T &operator[](int i) { return array_[i]; } - __device__ __host__ const T &operator[](int i) const { return array_[i]; } - }; - -} // namespace quda +#include "../generic/thread_array.h" From 7df0d93027934b73b1978b301b25877ab5dd18f5 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 16 Aug 2023 12:40:02 -0500 Subject: [PATCH 13/27] fix HIP build --- include/targets/hip/shared_memory_helper.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index bd8d919359..21a61cca45 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -14,10 +14,10 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the static member S::size(). The - offset from the beginning of the total shared memory block is - given by the static member O::shared_mem_size(block), or 0 if O - is void. + number of elements given by the call to the static member + S::size(target::block_dim()). The offset from the beginning of + the total shared memory block is given by the static member + O::shared_mem_size(target::block_dim()), or 0 if O is void. 
*/ template class SharedMemory { @@ -56,6 +56,7 @@ namespace quda return target::dispatch(offset); } + public: static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -63,16 +64,16 @@ namespace quda return o; } - public: static constexpr unsigned int shared_mem_size(dim3 block) { - return get_offset(block) + S::size()*sizeof(T); + return get_offset(block) + S::size(block)*sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), + size(S::size(target::block_dim())) {} /** @brief Subscripting operator returning a reference to element. From c73d2b4ae4b7a001aa7e782bf6ccaeb90c9a6f36 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 18 Aug 2023 22:41:07 -0500 Subject: [PATCH 14/27] update shared memory object --- include/targets/cuda/shared_memory_helper.h | 2 ++ include/targets/generic/shared_memory_cache_helper.h | 8 +++++--- include/targets/generic/thread_array.h | 4 +++- include/targets/generic/thread_local_cache.h | 6 +++--- include/targets/hip/shared_memory_helper.h | 2 ++ 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index 21a61cca45..b7dacbb9cf 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -75,6 +75,8 @@ namespace quda constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size(target::block_dim())) {} + constexpr auto smem() const { return *this; } + /** @brief Subscripting operator returning a reference to element. @param[in] i The index to use. diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index f9b7eccd77..149a7016f3 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -48,6 +48,7 @@ namespace quda using dims_type = D; using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one using Smem = SharedMemory, SizeDims)>, O>; + using Smem::shared_mem_size; private: using atom_t = atom_t; @@ -62,7 +63,10 @@ namespace quda const dim3 block; const int stride; - constexpr Smem smem() const { return *dynamic_cast(this); } + //constexpr Smem smem() const { return *dynamic_cast(this); } + using Smem::smem; + //constexpr Smem smem() const { return Smem::smem(); } + //constexpr Smem smem() const { return *Smem::smemp(); } __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const { @@ -100,8 +104,6 @@ namespace quda }; public: - using Smem::shared_mem_size; - /** @brief constructor for SharedMemory cache. 
If no arguments are pass, then the dimensions are set according to the templates diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 5bbb243391..583468c036 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -16,9 +16,11 @@ namespace quda class thread_array : SharedMemory, SizePerThread<1>, O> { using Smem = SharedMemory, SizePerThread<1>, O>; - constexpr Smem smem() const { return *dynamic_cast(this); } array &array_; + //constexpr Smem smem() const { return *dynamic_cast(this); } + using Smem::smem; + public: __device__ __host__ constexpr thread_array() : array_(smem()[target::thread_idx_linear<3>()]) diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 3b14564bec..545a365e25 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -27,6 +27,7 @@ namespace quda using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int len = std::max(1,N); // actual number of elements to store using Smem = SharedMemory, SizePerThread)>, O>; + using Smem::shared_mem_size; private: using atom_t = atom_t; @@ -37,7 +38,8 @@ namespace quda const int stride; - constexpr Smem smem() const { return *dynamic_cast(this); } + //constexpr Smem smem() const { return *dynamic_cast(this); } + using Smem::smem; __device__ __host__ inline void save_detail(const T &a, const int k) const { @@ -60,8 +62,6 @@ namespace quda } public: - using Smem::shared_mem_size; - /** @brief Constructor for ThreadLocalCache. */ diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index 21a61cca45..b7dacbb9cf 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -75,6 +75,8 @@ namespace quda constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size(target::block_dim())) {} + constexpr auto smem() const { return *this; } + /** @brief Subscripting operator returning a reference to element. @param[in] i The index to use. 
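The replacement of the dynamic_cast accessor with a using-declaration deserves a remark: one plausible motivation is that dynamic_cast relies on RTTI, which is not available in device code, whereas re-exposing a privately inherited member costs nothing. A minimal sketch of the idiom (illustrative names, not from the patch):

    struct BaseSketch {
      constexpr int datum() const { return 42; }
      constexpr const BaseSketch &self() const { return *this; } // like smem()
    };

    struct DerivedSketch : private BaseSketch {
      using BaseSketch::self; // re-expose the private base accessor, no cast needed
      constexpr int get() const { return self().datum(); }
    };

    static_assert(DerivedSketch {}.get() == 42);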
From bab726ebc838f6c94bb514960665e0e984bd6072 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Sat, 19 Aug 2023 14:29:51 -0500 Subject: [PATCH 15/27] fix clang build --- include/targets/generic/helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index 2ab4efbda4..a08cefa84b 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -33,7 +33,7 @@ namespace quda template struct DimsStatic { - static constexpr dim3 dims(dim3 block) { + static constexpr dim3 dims(dim3) { return dim3(x,y,z); } }; From b57658280e970f79e28870b1a1bf1ad5e1e620e1 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 31 Aug 2023 22:02:38 -0500 Subject: [PATCH 16/27] code cleanup --- include/kernels/block_transpose.cuh | 5 +- include/kernels/coarse_op_kernel.cuh | 4 -- include/kernels/color_spinor_pack.cuh | 16 ++--- include/kernels/gauge_stout.cuh | 4 +- include/kernels/gauge_utils.cuh | 1 - include/kernels/gauge_wilson_flow.cuh | 4 +- include/kernels/hisq_paths_force.cuh | 1 - include/targets/cuda/shared_memory_helper.h | 21 ++++-- include/targets/cuda/thread_array.h | 1 + include/targets/generic/helpers.h | 18 +++++ .../generic/shared_memory_cache_helper.h | 69 ++++++------------- include/targets/generic/thread_array.h | 15 ++-- include/targets/generic/thread_local_cache.h | 39 +++++++---- include/targets/hip/shared_memory_helper.h | 21 ++++-- 14 files changed, 110 insertions(+), 109 deletions(-) diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh index 2b41f5c520..3c54345a6c 100644 --- a/include/kernels/block_transpose.cuh +++ b/include/kernels/block_transpose.cuh @@ -47,7 +47,7 @@ namespace quda constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { } static constexpr const char *filename() { return KERNEL_FILE; } - struct Dims { + struct CacheDims { static constexpr dim3 dims(dim3 block) { block.x += 1; block.z = 1; @@ -68,8 +68,7 @@ namespace quda int parity = parity_color / Arg::nColor; using color_spinor_t = ColorSpinor; - //SharedMemoryCache cache({target::block_dim().x + 1, target::block_dim().y, 1}); - SharedMemoryCache cache; + SharedMemoryCache cache; int x_offset = target::block_dim().x * target::block_idx().x; int v_offset = target::block_dim().y * target::block_idx().y; diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh index be411eb68c..04c05fda4e 100644 --- a/include/kernels/coarse_op_kernel.cuh +++ b/include/kernels/coarse_op_kernel.cuh @@ -1398,8 +1398,6 @@ namespace quda { using real = typename Arg::Float; using TileType = typename Arg::vuvTileType; const int dim_index = arg.dim_index % arg.Y_atomic.geometry; - //__shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; - //__shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; Cache cache; auto &X = cache.data()[0]; auto &Y = cache.data()[1]; @@ -1424,7 +1422,6 @@ namespace quda { } } - //__syncthreads(); cache.sync(); #pragma unroll @@ -1454,7 +1451,6 @@ namespace quda { } } - //__syncthreads(); cache.sync(); if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) { diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index 2cc489181a..a67d86ded2 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ 
-172,12 +172,12 @@ namespace quda { }; template <> struct site_max { - template - struct DimsPadX { + template struct CacheDims { static constexpr int Ms = spins_per_thread(Arg::nSpin); static constexpr int Mc = colors_per_thread(Arg::nColor); static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); static constexpr dim3 dims(dim3 block) { + // pad the shared block size to avoid bank conflicts for native ordering if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); block.y = color_spin_threads; // state the y block since we know it at compile time return block; @@ -187,16 +187,8 @@ namespace quda { template __device__ inline auto operator()(typename Arg::real thread_max, Arg &) { using real = typename Arg::real; - //constexpr int Ms = spins_per_thread(Arg::nSpin); - //constexpr int Mc = colors_per_thread(Arg::nColor); - //constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); - constexpr int color_spin_threads = DimsPadX::color_spin_threads; - //auto block = target::block_dim(); - // pad the shared block size to avoid bank conflicts for native ordering - //if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); - //block.y = color_spin_threads; // state the y block since we know it at compile time - //SharedMemoryCache cache(block); - SharedMemoryCache> cache; + constexpr int color_spin_threads = CacheDims::color_spin_threads; + SharedMemoryCache> cache; cache.save(thread_max); cache.sync(); real this_site_max = static_cast(0); diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 56bd00f425..4577e66fcd 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -135,8 +135,8 @@ namespace quda } Link U, Q; - ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Stap; + ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh index 6e91e1ac31..48c7e6c1cc 100644 --- a/include/kernels/gauge_utils.cuh +++ b/include/kernels/gauge_utils.cuh @@ -2,7 +2,6 @@ #include #include #include -#include namespace quda { diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index 050295f271..ae28956112 100644 --- a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -72,8 +72,8 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. 
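The CacheDims struct above moves the block-shape logic into a type so that it is also available to the static shared_mem_size computation. A stand-alone sketch of the padding idea, with the warp size fixed at 32 purely for illustration (the real code queries device::warp_size()):

    // Pad block.x up to a warp multiple so that row strides in the cache
    // avoid shared-memory bank conflicts for native field ordering.
    struct PadXDimsSketch {
      static constexpr unsigned int warp_size = 32; // assumed, not queried
      static constexpr dim3 dims(dim3 block)
      {
        block.x = ((block.x + warp_size - 1) / warp_size) * warp_size;
        return block;
      }
    };
    static_assert(PadXDimsSketch::dims(dim3 {40, 4, 1}).x == 64);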
From the following paper: // https://arxiv.org/abs/0801.1165 - ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Stap; + ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); break; diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index 84173a9a5a..35ddde688d 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -538,7 +538,6 @@ namespace quda { * The "extra" low point corresponds to the Lepage contribution to the * force_mu term. * - * * sig * F E * | | diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index b7dacbb9cf..2da596f6ed 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -14,9 +14,9 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the call to the static member - S::size(target::block_dim()). The offset from the beginning of - the total shared memory block is given by the static member + number of elements given by a call to the static member + S::size(target::block_dim()). The byte offset from the beginning + of the total shared memory block is given by the static member O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory @@ -26,7 +26,6 @@ namespace quda private: T *data; - const unsigned int size; // number of elements of type T /** @brief This is a dummy instantiation for the host compiler @@ -57,6 +56,9 @@ namespace quda } public: + /** + @brief Byte offset for this shared memory object. + */ static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -64,6 +66,9 @@ namespace quda return o; } + /** + @brief Shared memory size in bytes. + */ static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block)*sizeof(T); @@ -72,10 +77,12 @@ namespace quda /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), - size(S::size(target::block_dim())) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} - constexpr auto smem() const { return *this; } + /** + @brief Return this SharedMemory object. + */ + constexpr auto sharedMem() const { return *this; } /** @brief Subscripting operator returning a reference to element. diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index f5e965576d..c88a178e2c 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -7,6 +7,7 @@ #else #include + namespace quda { template struct thread_array : array {}; diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index a08cefa84b..f8faf41b44 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -3,21 +3,33 @@ namespace quda { + /** + @brief Element type used for coalesced storage. + */ template using atom_t = std::conditional_t>; + /** + @brief Used to declare an object of fixed size. 
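The atom_t alias added at the top of helpers.h selects the storage word used for coalesced accesses; a sketch of one plausible selection rule, assuming CUDA's int2/int4 vector types (the exact in-tree definition may differ):

    #include <type_traits>

    // Pick the widest 4/8/16-byte word that evenly divides sizeof(T), so a
    // stored element moves in as few shared-memory transactions as possible.
    template <typename T>
    using atom_t_sketch =
      std::conditional_t<sizeof(T) % 16 == 0, int4,
                         std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;

    static_assert(std::is_same_v<atom_t_sketch<float>, int>);
    static_assert(std::is_same_v<atom_t_sketch<double>, int2>);
    static_assert(std::is_same_v<atom_t_sketch<double[2]>, int4>);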
+ */ template struct SizeStatic { static constexpr unsigned int size(dim3) { return N; } }; + /** + @brief Used to declare an object of fixed size per thread, N. + */ template struct SizePerThread { static constexpr unsigned int size(dim3 block) { return N * block.x * block.y * block.z; } }; + /** + @brief Used to declare an object of fixed size per thread, N, with thread dimensions determined by D. + */ template struct SizeDims { static constexpr unsigned int size(dim3 block) { dim3 dims = D::dims(block); @@ -25,12 +37,18 @@ namespace quda } }; + /** + @brief Used to declare an object with dimensions given by the block size. + */ struct DimsBlock { static constexpr dim3 dims(dim3 block) { return block; } }; + /** + @brief Used to declare an object with fixed dimensions. + */ template struct DimsStatic { static constexpr dim3 dims(dim3) { diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index 149a7016f3..1866375f76 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -11,46 +11,38 @@ sharing data between threads in a thread block. */ -/** - @file shared_memory_cache_helper.h - @brief Convenience overloads to allow SharedMemoryCache objects to - appear in simple expressions. The actual implementation of - SharedMemoryCache is target specific, and located in e.g., - include/targets/cuda/shared_memory_cache_helper.h, etc. - */ namespace quda { /** @brief Class which wraps around a shared memory cache for type T, where each thread in the thread block stores a unique value in - the cache which any other thread can access. + the cache which any other thread can access. The data is stored + in a coalesced order with element size atom_t. - This accessor supports both explicit run-time block size and - compile-time sizing. + The dimensions of the cache are determined by a call to + D::dims(target::block_dim()), and D defaults to having dimensions + equal to the block dimensions. - * For run-time block size, the constructor should be initialied - with the desired block size. - - * For compile-time block size, no arguments should be passed to - the constructor, and then the second and third template - parameters correspond to the y and z dimensions of the block, - respectively. The x dimension of the block will be set - according the maximum number of threads possible, given these - dimensions. + A byte offset into the shared memory region can be specified with + the type O, and is given by + O::shared_mem_size(target::block_dim()) if O is not void. */ template class SharedMemoryCache : SharedMemory, SizeDims)>, O> { + using Smem = SharedMemory, SizeDims)>, O>; + public: using value_type = T; using dims_type = D; - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one - using Smem = SharedMemory, SizeDims)>, O>; + using offset_type = O; using Smem::shared_mem_size; private: + const dim3 block; + const int stride; + using Smem::sharedMem; using atom_t = atom_t; static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g.
C array) template using maybeT = std::conditional_t,T,void>; - const dim3 block; - const int stride; - - //constexpr Smem smem() const { return *dynamic_cast(this); } - using Smem::smem; - //constexpr Smem smem() const { return Smem::smem(); } - //constexpr Smem smem() const { return *Smem::smemp(); } - __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const { atom_t tmp[n_element]; memcpy(tmp, (void *)&a, sizeof(T)); int j = (z * block.y + y) * block.x + x; #pragma unroll - for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i]; } template @@ -83,7 +67,7 @@ namespace quda atom_t tmp[n_element]; int j = (z * block.y + y) * block.x + x; #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[i * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -105,28 +89,20 @@ namespace quda public: /** - @brief constructor for SharedMemory cache. If no arguments are - pass, then the dimensions are set according to the templates - block_size_y and block_size_z, together with the derived - block_size_x. Otherwise use the block sizes passed into the - constructor. - - @param[in] block Block dimensions for the 3-d shared memory object - @param[in] thread_offset "Perceived" offset from dynamic shared - memory base pointer (used when we have multiple caches in - scope). Need to include block size to actual offset. + @brief Constructor for SharedMemoryCache. */ constexpr SharedMemoryCache() : block(D::dims(target::block_dim())), stride(block.x * block.y * block.z) { - static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims::size(dim3{8,8,8})*sizeof(T)); + // sanity check + static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims::size(dim3{32,16,8})*sizeof(T)); } /** @brief Grab the raw base address to shared memory. */ __device__ __host__ inline auto data() const { - return reinterpret_cast(&smem()[0]); + return reinterpret_cast(&sharedMem()[0]); } /** @@ -302,11 +278,10 @@ namespace quda /** @brief Uniform helper for exposing type T, whether we are dealing - with an instance of T or SharedMemoryCache + with an instance of T or SharedMemoryCache */ template - struct get_type< - T, std::enable_if_t>>> { + struct get_type>>> { using type = typename T::value_type; }; diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 583468c036..5325e5ab8f 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -8,29 +8,28 @@ namespace quda { /** - @brief Class that provides indexable per-thread storage. On CUDA - this maps to using assigning each thread a unique window of - shared memory using the SharedMemoryCache object. + @brief Class that provides indexable per-thread storage for n + elements of type T. This version uses shared memory for storage. + The offset into the shared memory region is determined from the + type O. 
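For reference, the word-wise access pattern used by save_detail above can be written as a self-contained device-side sketch (atom_t here is any word type that evenly divides sizeof(T)):

    #include <cstring>

    // The object is split into n_element words, and word i for thread j goes
    // to smem[i * stride + j], so adjacent threads write adjacent words.
    template <typename T, typename atom_t>
    __device__ inline void save_strided_sketch(atom_t *smem, const T &a, int j, int stride)
    {
      constexpr int n_element = sizeof(T) / sizeof(atom_t);
      atom_t tmp[n_element];
      memcpy(tmp, &a, sizeof(T)); // decompose T into words without aliasing UB
    #pragma unroll
      for (int i = 0; i < n_element; i++) smem[i * stride + j] = tmp[i];
    }

The load path is the mirror image: gather the strided words into a local array, then memcpy them back into a T.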
*/ template class thread_array : SharedMemory, SizePerThread<1>, O> { using Smem = SharedMemory, SizePerThread<1>, O>; + using Smem::sharedMem; array &array_; - //constexpr Smem smem() const { return *dynamic_cast(this); } - using Smem::smem; - public: __device__ __host__ constexpr thread_array() : - array_(smem()[target::thread_idx_linear<3>()]) + array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array(); // call default constructor } template __device__ __host__ constexpr thread_array(T first, const Ts... other) : - array_(smem()[target::thread_idx_linear<3>()]) + array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array {first, other...}; } diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 545a365e25..277b16903c 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -15,39 +15,38 @@ namespace quda { /** - @brief Class for threads to store a unique value, or array of values, which can use - shared memory for optimization purposes. + @brief Class for threads to store a unique value (for N_ == 0), + or array of values (for N_ > 0), which can use shared memory for + optimization purposes. */ template class ThreadLocalCache : SharedMemory, SizePerThread)>, O> { + using Smem = SharedMemory, SizePerThread)>, O>; + public: using value_type = T; static constexpr int N = N_; // size of array, 0 means to behave like T instead of array using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int len = std::max(1,N); // actual number of elements to store - using Smem = SharedMemory, SizePerThread)>, O>; using Smem::shared_mem_size; private: + const int stride; + using Smem::sharedMem; using atom_t = atom_t; static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); // The number of elements of type atom_t that we break T into for optimal shared-memory access static constexpr int n_element = sizeof(T) / sizeof(atom_t); - const int stride; - - //constexpr Smem smem() const { return *dynamic_cast(this); } - using Smem::smem; - __device__ __host__ inline void save_detail(const T &a, const int k) const { atom_t tmp[n_element]; memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) smem()[(k*n_element + i) * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) sharedMem()[(k*n_element + i) * stride + j] = tmp[i]; } __device__ __host__ inline T load_detail(const int k) const @@ -55,7 +54,7 @@ namespace quda atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = smem()[(k*n_element + i) * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k*n_element + i) * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -66,7 +65,8 @@ namespace quda @brief Constructor for ThreadLocalCache. 
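As a usage illustration (the kernel context and the dir variable are placeholders, not from the patch), a thread_array reads like an ordinary small array even though each thread's copy lives in its own shared-memory window:

    __device__ void neighbor_shift_sketch(int dir)
    {
      thread_array<int, 4> dx {0, 0, 0, 0}; // per-thread storage, shared-memory backed
      dx[dir] = 1;                          // e.g., a unit shift in one direction
      // dx[] can now be used like any local coordinate-offset array
    }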
*/ constexpr ThreadLocalCache() : stride(target::block_size<3>()) { - static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizePerThread::size(dim3{8,8,8})*sizeof(T)); + // sanity check + static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizePerThread::size(dim3{32,16,8})*sizeof(T)); } /** @@ -83,7 +83,10 @@ namespace quda @param[in] a The value to store in the thread local cache @param[in] k The index to use */ - __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } + __device__ __host__ inline void save(const T &a, const int k) const { + static_assert(N > 0); + save_detail(a, k); + } /** @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. @@ -99,7 +102,10 @@ namespace quda @param[in] k The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ inline T load(const int k) const { return load_detail(k); } + __device__ __host__ inline T load(const int k) const { + static_assert(N > 0); + return load_detail(k); + } /** @brief Cast operator to allow cache objects to be used where T is expected (when N==0). @@ -123,7 +129,10 @@ namespace quda @param[in] i The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ T operator[](int i) { return load(i); } + __device__ __host__ T operator[](int i) { + static_assert(N > 0); + return load(i); + } }; template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) @@ -165,7 +174,7 @@ namespace quda /** @brief Uniform helper for exposing type T, whether we are dealing - with an instance of T or ThreadLocalCache + with an instance of T or ThreadLocalCache */ template struct get_type>>> { diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index b7dacbb9cf..2da596f6ed 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -14,9 +14,9 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the call to the static member - S::size(target::block_dim()). The offset from the beginning of - the total shared memory block is given by the static member + number of elements given by a call to the static member + S::size(target::block_dim()). The byte offset from the beginning + of the total shared memory block is given by the static member O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory @@ -26,7 +26,6 @@ namespace quda private: T *data; - const unsigned int size; // number of elements of type T /** @brief This is a dummy instantiation for the host compiler @@ -57,6 +56,9 @@ namespace quda } public: + /** + @brief Byte offset for this shared memory object. + */ static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -64,6 +66,9 @@ namespace quda return o; } + /** + @brief Shared memory size in bytes. + */ static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block)*sizeof(T); @@ -72,10 +77,12 @@ namespace quda /** @brief Constructor for SharedMemory object. 
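Read against the gauge_stout and gauge_wilson_flow hunks earlier in the series, the Stap/Rect declarations chain their allocations through the offset type; a sketch with the template arguments written out (reconstructed for illustration, the in-tree spelling may differ):

    // Rect names Stap's type as its offset type O, so Rect's shared-memory
    // window starts exactly where Stap's ends: no overlap by construction.
    ThreadLocalCache<Link> Stap;                    // offset 0
    ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // placed after Stap's window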
*/ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), - size(S::size(target::block_dim())) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} - constexpr auto smem() const { return *this; } + /** + @brief Return this SharedMemory object. + */ + constexpr auto sharedMem() const { return *this; } /** @brief Subscripting operator returning a reference to element. From 5889bc45c1bf2b4c8c51889410de48f7668adecb Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 1 Sep 2023 08:17:26 -0500 Subject: [PATCH 17/27] format --- include/targets/cuda/shared_memory_helper.h | 12 +--- include/targets/cuda/thread_array.h | 5 +- include/targets/generic/helpers.h | 24 +++---- .../generic/shared_memory_cache_helper.h | 35 +++++------ include/targets/generic/thread_array.h | 8 +-- include/targets/generic/thread_local_cache.h | 62 ++++++++++++------- include/targets/hip/shared_memory_helper.h | 12 +--- 7 files changed, 74 insertions(+), 84 deletions(-) diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index 2da596f6ed..bc9bd7c66b 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -50,10 +50,7 @@ namespace quda } }; - __device__ __host__ inline T *cache(unsigned int offset) const - { - return target::dispatch(offset); - } + __device__ __host__ inline T *cache(unsigned int offset) const { return target::dispatch(offset); } public: /** @@ -69,15 +66,12 @@ namespace quda /** @brief Shared memory size in bytes. */ - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + S::size(block)*sizeof(T); - } + static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block) * sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) { } /** @brief Return this SharedMemory object. diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index c88a178e2c..8237fcb87d 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -10,7 +10,8 @@ namespace quda { - template struct thread_array : array {}; -} + template struct thread_array : array { + }; +} // namespace quda #endif diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index f8faf41b44..fcf673db8c 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -13,25 +13,22 @@ namespace quda @brief Used to declare an object of fixed size. */ template struct SizeStatic { - static constexpr unsigned int size(dim3) { - return N; - } + static constexpr unsigned int size(dim3) { return N; } }; /** @brief Used to declare an object of fixed size per thread, N. */ template struct SizePerThread { - static constexpr unsigned int size(dim3 block) { - return N * block.x * block.y * block.z; - } + static constexpr unsigned int size(dim3 block) { return N * block.x * block.y * block.z; } }; /** @brief Used to declare an object of fixed size per thread, N, with thread dimensions determined by D.
*/ template struct SizeDims { - static constexpr unsigned int size(dim3 block) { + static constexpr unsigned int size(dim3 block) + { dim3 dims = D::dims(block); return dims.x * dims.y * dims.z * N; } @@ -41,19 +38,14 @@ namespace quda @brief Used to declare an object with dimensions given by the block size. */ struct DimsBlock { - static constexpr dim3 dims(dim3 block) { - return block; - } + static constexpr dim3 dims(dim3 block) { return block; } }; /** @brief Used to declare an object with fixed dimensions. */ - template - struct DimsStatic { - static constexpr dim3 dims(dim3) { - return dim3(x,y,z); - } + template struct DimsStatic { + static constexpr dim3 dims(dim3) { return dim3(x, y, z); } }; /** @@ -64,4 +56,4 @@ namespace quda using type = T; }; -} +} // namespace quda diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index 1866375f76..3e4ad31302 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -29,9 +29,9 @@ namespace quda O::shared_mem_size(target::block_dim()) if O is not void. */ template - class SharedMemoryCache : SharedMemory, SizeDims)>, O> + class SharedMemoryCache : SharedMemory, SizeDims)>, O> { - using Smem = SharedMemory, SizeDims)>, O>; + using Smem = SharedMemory, SizeDims)>, O>; public: using value_type = T; @@ -50,7 +50,7 @@ namespace quda static constexpr int n_element = sizeof(T) / sizeof(atom_t); // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array) - template using maybeT = std::conditional_t,T,void>; + template using maybeT = std::conditional_t, T, void>; __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const { @@ -61,8 +61,7 @@ namespace quda for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i]; } - template - __device__ __host__ inline maybeT load_detail(int x, int y, int z) const + template __device__ __host__ inline maybeT load_detail(int x, int y, int z) const { atom_t tmp[n_element]; int j = (z * block.y + y) * block.x + x; @@ -91,19 +90,17 @@ namespace quda /** @brief Constructor for SharedMemoryCache. */ - constexpr SharedMemoryCache() : - block(D::dims(target::block_dim())), stride(block.x * block.y * block.z) + constexpr SharedMemoryCache() : block(D::dims(target::block_dim())), stride(block.x * block.y * block.z) { // sanity check - static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims::size(dim3{32,16,8})*sizeof(T)); + static_assert(shared_mem_size(dim3 {32, 16, 8}) + == Smem::get_offset(dim3 {32, 16, 8}) + SizeDims::size(dim3 {32, 16, 8}) * sizeof(T)); } /** @brief Grab the raw base address to shared memory. */ - __device__ __host__ inline auto data() const { - return reinterpret_cast(&sharedMem()[0]); - } + __device__ __host__ inline auto data() const { return reinterpret_cast(&sharedMem()[0]); } /** @brief Save the value into the 3-d shared memory cache. @@ -179,8 +176,7 @@ namespace quda @param[in] x The x index to use @return The value at coordinates (x,y,z) */ - template - __device__ __host__ inline maybeT load_x(int x = -1) const + template __device__ __host__ inline maybeT load_x(int x = -1) const { auto tid = target::thread_idx(); x = (x == -1) ? 
tid.x : x; @@ -192,8 +188,7 @@ namespace quda @param[in] y The y index to use @return The value at coordinates (x,y,z) */ - template - __device__ __host__ inline maybeT load_y(int y = -1) const + template __device__ __host__ inline maybeT load_y(int y = -1) const { auto tid = target::thread_idx(); y = (y == -1) ? tid.y : y; @@ -205,8 +200,7 @@ namespace quda @param[in] z The z index to use @return The value at coordinates (x,y,z) */ - template - __device__ __host__ inline maybeT load_z(int z = -1) const + template __device__ __host__ inline maybeT load_z(int z = -1) const { auto tid = target::thread_idx(); z = (z == -1) ? tid.z : z; @@ -222,8 +216,7 @@ namespace quda @brief Cast operator to allow cache objects to be used where T is expected */ - template - __device__ __host__ operator maybeT() const { return load(); } + template __device__ __host__ operator maybeT() const { return load(); } /** @brief Assignment operator to allow cache objects to be used on @@ -281,7 +274,9 @@ namespace quda with an instance of T or SharedMemoryCache */ template - struct get_type>>> { + struct get_type>>> { using type = typename T::value_type; }; diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 5325e5ab8f..0e641a11df 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -13,16 +13,14 @@ namespace quda The offset into the shared memory region is determined from the type O. */ - template - class thread_array : SharedMemory, SizePerThread<1>, O> + template class thread_array : SharedMemory, SizePerThread<1>, O> { - using Smem = SharedMemory, SizePerThread<1>, O>; + using Smem = SharedMemory, SizePerThread<1>, O>; using Smem::sharedMem; array &array_; public: - __device__ __host__ constexpr thread_array() : - array_(sharedMem()[target::thread_idx_linear<3>()]) + __device__ __host__ constexpr thread_array() : array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array(); // call default constructor } diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 277b16903c..7d33ad40e0 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -19,16 +19,16 @@ namespace quda or array of values (for N_ > 0), which can use shared memory for optimization purposes. 
*/ - template class ThreadLocalCache : - SharedMemory, SizePerThread)>, O> + template + class ThreadLocalCache : SharedMemory, SizePerThread)>, O> { - using Smem = SharedMemory, SizePerThread)>, O>; + using Smem = SharedMemory, SizePerThread)>, O>; public: using value_type = T; static constexpr int N = N_; // size of array, 0 means to behave like T instead of array - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one - static constexpr int len = std::max(1,N); // actual number of elements to store + using offset_type = O; // type of object using shared memory at the same time that is located before this one + static constexpr int len = std::max(1, N); // actual number of elements to store using Smem::shared_mem_size; private: @@ -46,7 +46,7 @@ namespace quda memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) sharedMem()[(k*n_element + i) * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) sharedMem()[(k * n_element + i) * stride + j] = tmp[i]; } __device__ __host__ inline T load_detail(const int k) const @@ -54,7 +54,7 @@ namespace quda atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k*n_element + i) * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k * n_element + i) * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -64,16 +64,19 @@ namespace quda /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : stride(target::block_size<3>()) { + constexpr ThreadLocalCache() : stride(target::block_size<3>()) + { // sanity check - static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizePerThread::size(dim3{32,16,8})*sizeof(T)); + static_assert(shared_mem_size(dim3 {32, 16, 8}) + == Smem::get_offset(dim3 {32, 16, 8}) + SizePerThread::size(dim3 {32, 16, 8}) * sizeof(T)); } /** @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. @param[in] a The value to store in the thread local cache */ - __device__ __host__ inline void save(const T &a) const { + __device__ __host__ inline void save(const T &a) const + { static_assert(N == 0); save_detail(a, 0); } @@ -83,7 +86,8 @@ namespace quda @param[in] a The value to store in the thread local cache @param[in] k The index to use */ - __device__ __host__ inline void save(const T &a, const int k) const { + __device__ __host__ inline void save(const T &a, const int k) const + { static_assert(N > 0); save_detail(a, k); } @@ -92,7 +96,8 @@ namespace quda @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. @return The value stored in the thread local cache */ - __device__ __host__ inline T load() const { + __device__ __host__ inline T load() const + { static_assert(N == 0); return load_detail(0); } @@ -102,7 +107,8 @@ namespace quda @param[in] k The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ inline T load(const int k) const { + __device__ __host__ inline T load(const int k) const + { static_assert(N > 0); return load_detail(k); } @@ -110,7 +116,8 @@ namespace quda /** @brief Cast operator to allow cache objects to be used where T is expected (when N==0). 
*/ - __device__ __host__ operator T() const { + __device__ __host__ operator T() const + { static_assert(N == 0); return load(); } @@ -119,7 +126,8 @@ namespace quda @brief Assignment operator to allow cache objects to be used on the lhs where T is otherwise expected (when N==0). */ - __device__ __host__ void operator=(const T &src) const { + __device__ __host__ void operator=(const T &src) const + { static_assert(N == 0); save(src); } @@ -129,39 +137,46 @@ namespace quda @param[in] i The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ T operator[](int i) { + __device__ __host__ T operator[](int i) + { static_assert(N > 0); return load(i); } }; - template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; } - template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + template + __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) { return a + static_cast(b); } - template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) { return static_cast(a) - b; } - template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + template + __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) { return a - static_cast(b); } - template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) + b); return a; } - template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) - b); return a; @@ -177,7 +192,8 @@ namespace quda with an instance of T or ThreadLocalCache */ template - struct get_type>>> { + struct get_type< + T, std::enable_if_t>>> { using type = typename T::value_type; }; diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index 2da596f6ed..bc9bd7c66b 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -50,10 +50,7 @@ namespace quda } }; - __device__ __host__ inline T *cache(unsigned int offset) const - { - return target::dispatch(offset); - } + __device__ __host__ inline T *cache(unsigned int offset) const { return target::dispatch(offset); } public: /** @@ -69,15 +66,12 @@ namespace quda /** @brief Shared memory size in bytes. */ - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + S::size(block)*sizeof(T); - } + static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block) * sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) { } /** @brief Return this SharedMemory object. 
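These conversion, assignment, and free arithmetic operators let an N==0 cache stand in for T directly in expressions; a hypothetical kernel fragment (U, V, and coeff are placeholders, not from the patch):

    ThreadLocalCache<Link> Stap;
    Stap = U * conj(V);                       // operator=(const T &) -> save()
    Link Z = coeff * static_cast<Link>(Stap); // operator T() -> load()
    Stap += Z;                                // free operator+= round-trips load()/save()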
From 9f7074f3103cee6afdcbf87055b77ad72b39ab33 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 7 Sep 2023 08:50:36 -0700 Subject: [PATCH 18/27] Fix some issues with TuneParam::shared_bytes settings in some of the tuning classes --- include/tunable_block_reduction.h | 8 +++++++- include/tunable_nd.h | 19 ++++++-------------
 include/tunable_reduction.h | 13 ++++++++++++- include/tune_quda.h | 19 +++++++++++-------- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/include/tunable_block_reduction.h b/include/tunable_block_reduction.h index f4552605e5..8c9af36d5a 100644 --- a/include/tunable_block_reduction.h +++ b/include/tunable_block_reduction.h @@ -162,12 +162,18 @@ namespace quda return true; } else { // block.x (spacetime) was reset + auto next = param; + next.block.z += step_z; + auto shared_bytes = setSharedBytes(next); + // we can advance spin/block-color since this is valid if (param.block.z < vector_length_z && param.block.z < device::max_threads_per_block_dim(2) && param.block.x * param.block.y * (param.block.z + step_z) <= device::max_threads_per_block() - && ((param.block.z + step_z) <= max_block_z)) { + && ((param.block.z + step_z) <= max_block_z) + && shared_bytes <= this->maxSharedBytesPerBlock()) { param.block.z += step_z; param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z; + param.shared_bytes = shared_bytes; return true; } else { // we have run off the end so let's reset param.block.z = step_z; diff --git a/include/tunable_nd.h b/include/tunable_nd.h index 43af42f932..b942bc041d 100644 --- a/include/tunable_nd.h +++ b/include/tunable_nd.h @@ -265,8 +265,7 @@ namespace quda auto next = param; next.block.y += step_y; - auto shared_bytes = std::max(this->sharedBytesPerThread() * next.block.x * next.block.y * next.block.z, - this->sharedBytesPerBlock(next)); + auto shared_bytes = this->setSharedBytes(next); // we can advance spin/block-color since this is valid if (param.block.y < vector_length_y && param.block.y < device::max_threads_per_block_dim(1) @@ -279,7 +278,6 @@ namespace quda } else { // we have run off the end so let's reset param.block.y = step_y; param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y; - return false; } } @@ -294,8 +292,7 @@ namespace quda Tunable::initTuneParam(param); param.block.y = step_y; param.grid.y = (vector_length_y + step_y - 1) / step_y; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** @@ -307,8 +304,7 @@ namespace quda Tunable::defaultTuneParam(param); param.block.y = step_y; param.grid.y = (vector_length_y + step_y - 1) / step_y; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** @@ -524,8 +520,7 @@ namespace quda auto next = param; next.block.z += step_z; - auto shared_bytes = std::max(this->sharedBytesPerThread() * next.block.x * next.block.y * next.block.z, - this->sharedBytesPerBlock(next)); + auto shared_bytes = this->setSharedBytes(next); // we can advance spin/block-color since this is valid if (param.block.z < vector_length_z && param.block.z < device::max_threads_per_block_dim(2) @@ -552,8 +547,7 @@ namespace quda TunableKernel2D_base::initTuneParam(param); param.block.z = step_z; param.grid.z = (vector_length_z + step_z - 1) / step_z; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x *
param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** @@ -565,8 +559,7 @@ namespace quda TunableKernel2D_base::defaultTuneParam(param); param.block.z = step_z; param.grid.z = (vector_length_z + step_z - 1) / step_z; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** diff --git a/include/tunable_reduction.h b/include/tunable_reduction.h index d24904b2fd..a18030fd93 100644 --- a/include/tunable_reduction.h +++ b/include/tunable_reduction.h @@ -166,6 +166,7 @@ namespace quda { TunableKernel::initTuneParam(param); param.block.y = block_size_y; + setSharedBytes(param); } /** @@ -176,6 +177,7 @@ namespace quda { TunableKernel::defaultTuneParam(param); param.block.y = block_size_y; + setSharedBytes(param); } }; @@ -323,11 +325,18 @@ namespace quda if (rtn) { return true; } else { + + auto next = param; + next.block.z++; + auto shared_bytes = setSharedBytes(next); + if (param.block.z < n_batch && param.block.z < device::max_threads_per_block_dim(2) && param.block.x * param.block.y * (param.block.z + 1) <= device::max_threads_per_block() - && param.block.z < n_batch_block_max) { + && param.block.z < n_batch_block_max + && shared_bytes <= this->maxSharedBytesPerBlock()) { param.block.z++; param.grid.z = (n_batch + param.block.z - 1) / param.block.z; + param.shared_bytes = shared_bytes; return true; } else { // we have run off the end so let's reset param.block.z = 1; @@ -346,6 +355,7 @@ TunableReduction2D::initTuneParam(param); param.block = {param.block.x, param.block.y, 1}; param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z}; + setSharedBytes(param); } /** @@ -357,6 +367,7 @@ TunableReduction2D::defaultTuneParam(param); param.block = {param.block.x, param.block.y, 1}; param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z}; + setSharedBytes(param); } }; diff --git a/include/tune_quda.h b/include/tune_quda.h index 1511f6f881..9f88bbd6f9 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -150,6 +150,13 @@ namespace quda { } } + auto setSharedBytes(TuneParam &param) const + { + int nthreads = param.block.x * param.block.y * param.block.z; + param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + return param.shared_bytes; + } + virtual bool advanceBlockDim(TuneParam &param) const { const unsigned int max_threads = maxBlockSize(param); @@ -157,14 +164,12 @@ namespace quda { bool ret; param.block.x += blockStep(); - int nthreads = param.block.x * param.block.y * param.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + setSharedBytes(param); if (param.block.x > max_threads || param.shared_bytes > max_shared || param.block.x * param.block.y * param.block.z > device::max_threads_per_block()) { resetBlockDim(param); - int nthreads = param.block.x * param.block.y * param.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + setSharedBytes(param); ret = false; } else { ret = true; @@ -214,8 +219,7 @@ namespace quda { if (param.shared_bytes > max_shared) { TuneParam next(param); advanceBlockDim(next); // to get next blockDim - int nthreads = next.block.x * next.block.y * next.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(next)); +
param.shared_bytes = setSharedBytes(next); return false; } else { return true; } } @@ -325,8 +329,7 @@ namespace quda { param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1); } - int nthreads = param.block.x*param.block.y*param.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + setSharedBytes(param); } /** sets default values for when tuning is disabled */ From 12c5c9899197215a5ee49d75de6413aa84ff337f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 7 Sep 2023 09:03:50 -0700 Subject: [PATCH 19/27] Fix sharedBytesPerThread for kernels that utilize thread_array --- lib/clover_deriv_quda.cu | 1 + lib/gauge_ape.cu | 1 + lib/gauge_field_strength_tensor.cu | 1 + lib/gauge_force.cu | 1 + lib/gauge_loop_trace.cu | 7 ++++--- lib/gauge_stout.cu | 3 ++- lib/gauge_wilson_flow.cu | 3 ++- 7 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu index 34f121de93..08702f322e 100644 --- a/lib/clover_deriv_quda.cu +++ b/lib/clover_deriv_quda.cu @@ -12,6 +12,7 @@ namespace quda { double coeff; int parity; unsigned int minThreads() const { return gauge.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: DerivativeClover(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, int parity) : diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index 5ace8e5a29..f4a8388f87 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -13,6 +13,7 @@ namespace quda { const GaugeField &in; const Float alpha; unsigned int minThreads() const { return in.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim diff --git a/lib/gauge_field_strength_tensor.cu b/lib/gauge_field_strength_tensor.cu index d0ec026881..d7a555b890 100644 --- a/lib/gauge_field_strength_tensor.cu +++ b/lib/gauge_field_strength_tensor.cu @@ -11,6 +11,7 @@ namespace quda GaugeField &f; const GaugeField &u; unsigned int minThreads() const { return f.VolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: Fmunu(const GaugeField &u, GaugeField &f) : diff --git a/lib/gauge_force.cu b/lib/gauge_force.cu index 2558dadcac..2c89045005 100644 --- a/lib/gauge_force.cu +++ b/lib/gauge_force.cu @@ -12,6 +12,7 @@ namespace quda { double epsilon; const paths<4> &p; unsigned int minThreads() const { return mom.VolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: ForceGauge(const GaugeField &u, GaugeField &mom, double epsilon, const paths<4> &p) : diff --git a/lib/gauge_loop_trace.cu b/lib/gauge_loop_trace.cu index faaaa97d99..9ec8aec5a8 100644 --- a/lib/gauge_loop_trace.cu +++ b/lib/gauge_loop_trace.cu @@ -13,6 +13,7 @@ namespace quda { std::vector& loop_traces; double factor; const paths<1> p; + unsigned int sharedBytesPerThread() const override { return 4 * sizeof(int); } // for thread_array public: // max block size of 8 is arbitrary for now @@ -31,14 +32,14 @@ namespace quda { apply(device::get_default_stream()); } - void apply(const qudaStream_t &stream) + void apply(const qudaStream_t &stream) override { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); GaugeLoopTraceArg arg(u, factor, p); launch(loop_traces, tp, stream, arg); } - long long flops() const + long long
flops() const override { auto Nc = u.Ncolor(); auto mat_mul_flops = 8ll * Nc * Nc * Nc - 2 * Nc * Nc; @@ -46,7 +47,7 @@ namespace quda { return (p.count * mat_mul_flops + p.num_paths * (2 * Nc + 2)) * u.Volume(); } - long long bytes() const { + long long bytes() const override { // links * one LatticeColorMatrix worth of data return p.count * u.Bytes() / 4; } diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index 6177ef8170..bbb676b214 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -19,7 +19,8 @@ namespace quda { unsigned int sharedBytesPerThread() const { // use SharedMemoryCache if using over improvement for two link fields - return improved ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0; + return (improved ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0) + + 4 * sizeof(int); // for thread_array } public: diff --git a/lib/gauge_wilson_flow.cu b/lib/gauge_wilson_flow.cu index a3ce38ba81..32768bb05a 100644 --- a/lib/gauge_wilson_flow.cu +++ b/lib/gauge_wilson_flow.cu @@ -25,7 +25,8 @@ namespace quda { unsigned int sharedBytesPerThread() const { // use SharedMemoryCache if using Symanzik improvement for two Link fields - return wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0; + return (wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0) + + 4 * sizeof(int); // for thread_array } public: From 5ce1230208e2f7e5295eab3e13f9a074443e92c7 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 7 Sep 2023 17:08:35 -0500 Subject: [PATCH 20/27] fix overlapping shared mem --- include/kernels/gauge_stout.cuh | 3 ++- include/kernels/gauge_utils.cuh | 2 ++ include/kernels/gauge_wilson_flow.cuh | 3 ++- include/targets/cuda/thread_array.h | 1 + include/targets/generic/thread_array.h | 2 ++ 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 4577e66fcd..712f191f83 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -135,7 +135,8 @@ namespace quda } Link U, Q; - ThreadLocalCache Stap; + //ThreadLocalCache Stap; + ThreadLocalCache Stap; ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh index 48c7e6c1cc..ded8c9377a 100644 --- a/include/kernels/gauge_utils.cuh +++ b/include/kernels/gauge_utils.cuh @@ -19,6 +19,7 @@ namespace quda // matrix+matrix = 18 floating-point ops // => Total number of floating point ops per function call // dims * (2*18 + 4*198) = dims*828 + using computeStapleOps = thread_array; template __host__ __device__ inline void computeStaple(const Arg &arg, const int *x, const Int *X, const int parity, const int nu, Staple &staple, const int dir_ignore) { @@ -94,6 +95,7 @@ namespace quda // matrix+matrix = 18 floating-point ops // => Total number of floating point ops per function call // dims * (8*18 + 28*198) = dims*5688 + using computeStapleRectangleOps = thread_array; template __host__ __device__ inline void computeStapleRectangle(const Arg &arg, const int *x, const Int *X, const int parity, const int nu, Staple &staple, Rectangle &rectangle, const int dir_ignore) diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index ae28956112..22864ce1b0 100644 --- 
a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -72,7 +72,8 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: // https://arxiv.org/abs/0801.1165 - ThreadLocalCache Stap; + //ThreadLocalCache Stap; + ThreadLocalCache Stap; ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index 8237fcb87d..1c4d7f3244 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -11,6 +11,7 @@ namespace quda { template struct thread_array : array { + static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } }; } // namespace quda diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 0e641a11df..d513394cfc 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -20,6 +20,8 @@ namespace quda array &array_; public: + using Smem::shared_mem_size; + __device__ __host__ constexpr thread_array() : array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array(); // call default constructor From 56e0ee68c3c34c4d02b8032451435ffdf0bd2ed6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 19 Sep 2023 22:38:00 -0700 Subject: [PATCH 21/27] Bug fix for VUV/VLV kernels now that dynamic shared memory is used --- lib/coarse_op.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh index 4c23f50781..e8b42cd2f3 100644 --- a/lib/coarse_op.cuh +++ b/lib/coarse_op.cuh @@ -245,7 +245,7 @@ namespace quda { unsigned int sharedBytesPerBlock(const TuneParam ¶m) const override { - if (type == COMPUTE_VUV || type == COMPUTE_VLV) + if ((type == COMPUTE_VUV || type == COMPUTE_VLV) && arg.shared_atomic) return 4*sizeof(storeType)*arg.max_color_height_per_block*arg.max_color_width_per_block*4*coarseSpin*coarseSpin; return TunableKernel3D::sharedBytesPerBlock(param); } @@ -577,9 +577,7 @@ namespace quda { if (type == COMPUTE_VUV || type == COMPUTE_VLV || type == COMPUTE_CONVERT || type == COMPUTE_RESCALE) arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 
0 : 1) + dim; arg.kd_dagger = kd_dagger; - if (type == COMPUTE_VUV || type == COMPUTE_VLV) tp.shared_bytes -= sharedBytesPerBlock(tp); // shared memory is static so don't include it in launch Launch(arg, tp, type, stream); - if (type == COMPUTE_VUV || type == COMPUTE_VLV) tp.shared_bytes += sharedBytesPerBlock(tp); // restore shared memory }; /** From c2d336c77ca901e57d5e9fb983e64268641ee7f7 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 19 Sep 2023 23:25:52 -0700 Subject: [PATCH 22/27] Add some shared memory checks when launching kernels --- include/targets/cuda/tunable_kernel.h | 3 +++ include/targets/hip/tunable_kernel.h | 3 +++ include/tune_quda.h | 16 ++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/include/targets/cuda/tunable_kernel.h b/include/targets/cuda/tunable_kernel.h index d7936eb497..7306ab355c 100644 --- a/include/targets/cuda/tunable_kernel.h +++ b/include/targets/cuda/tunable_kernel.h @@ -45,6 +45,7 @@ namespace quda std::enable_if_t(), qudaError_t> launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) { + checkSharedBytes(tp); #ifdef JITIFY launch_error = launch_jitify(kernel.name, tp, stream, arg); #else @@ -62,6 +63,7 @@ namespace quda std::enable_if_t(), qudaError_t> launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) { + checkSharedBytes(tp); #ifdef JITIFY // note we do the copy to constant memory after the kernel has been compiled in launch_jitify launch_error = launch_jitify(kernel.name, tp, stream, arg); @@ -83,6 +85,7 @@ namespace quda template