From aff7f03218b580f8f7f5f7292633082883c33f6c Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 18 May 2023 10:24:42 -0500 Subject: [PATCH 01/27] add ThreadLocalCache --- include/kernels/gauge_stout.cuh | 5 +- include/kernels/gauge_utils.cuh | 1 + include/kernels/gauge_wilson_flow.cuh | 5 +- include/kernels/hisq_paths_force.cuh | 77 +++++------ include/targets/cuda/thread_local_cache.h | 132 +++++++++++++++++++ include/targets/generic/thread_local_cache.h | 68 ++++++++++ 6 files changed, 247 insertions(+), 41 deletions(-) create mode 100644 include/targets/cuda/thread_local_cache.h create mode 100644 include/targets/generic/thread_local_cache.h diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 45cc176d88..ca22d0e9ac 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -6,6 +6,7 @@ #include #include #include +#include namespace quda { @@ -134,8 +135,8 @@ namespace quda } Link U, Q; - SharedMemoryCache Stap(target::block_dim()); - SharedMemoryCache Rect(target::block_dim(), sizeof(Link)); + ThreadLocalCache Stap{}; + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh index 48c7e6c1cc..6e91e1ac31 100644 --- a/include/kernels/gauge_utils.cuh +++ b/include/kernels/gauge_utils.cuh @@ -2,6 +2,7 @@ #include #include #include +#include namespace quda { diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index 327f7c7eb0..1799d3c0dd 100644 --- a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -4,6 +4,7 @@ #include #include #include +#include namespace quda { @@ -71,8 +72,8 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. 
From the following paper: // https://arxiv.org/abs/0801.1165 - SharedMemoryCache Stap(target::block_dim()); - SharedMemoryCache Rect(target::block_dim(), sizeof(Link)); // offset to ensure non-overlapping allocations + ThreadLocalCache Stap{}; + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); break; diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index cac909bf8a..522442da3b 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace quda { @@ -272,7 +272,7 @@ namespace quda { * A _______ B * mu_next | | * H| |G - * + * * Variables have been named to reflection dimensionality for * mu_positive == true, sig_positive == true, mu_next_positive == true **************************************************************************/ @@ -372,7 +372,7 @@ namespace quda { @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x @param[in/out] force_mu Accumulated force in the mu direction - @param[in] Uab_cache Shared memory cache that stores the gauge link going from a to b (read) + @param[in] Uab_cache Thread local cache that stores the gauge link going from a to b (read) @details This subset of the code computes the Lepage contribution to the fermion force. Data traffic: READ: cb_link, id_link, pMu_at_c @@ -386,7 +386,8 @@ namespace quda { Flops: 2 multiplies, 1 add, 1 rescale */ - __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu, SharedMemoryCache &Uab_cache) { + template + __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu, LinkCache &Uab_cache) { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -440,7 +441,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in] Uab_cache Shared memory cache that stores the gauge link going from a to b (read) + @param[in] Uab_cache Thread local cache that stores the gauge link going from a to b (read) Data traffic: READ: gb_link, oProd_at_h WRITE: pMu_next_at_b, p3_at_a @@ -454,7 +455,8 @@ namespace quda { Flops: 2 multiplies, 1 add, 1 rescale */ - __device__ __host__ inline void middle_three(int x[4], int point_a, int parity_a, SharedMemoryCache &Uab_cache) + template + __device__ __host__ inline void middle_three(int x[4], int point_a, int parity_a, LinkCache &Uab_cache) { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -535,8 +537,8 @@ namespace quda { /* * The "extra" low point corresponds to the Lepage contribution to the * force_mu term. 
- * - * + * + * * sig * F E * | | @@ -557,7 +559,7 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - SharedMemoryCache Uab_cache(target::block_dim()); + ThreadLocalCache Uab_cache{}; // Scoped load of Uab { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -636,7 +638,7 @@ namespace quda { Link force; Link shortP; Link p5; - + const Link pMu; // double-buffer: read pNuMu, qNuMu for side 5, middle 7 @@ -688,7 +690,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Shared memory cache that maintains the accumulated P5 contribution (write) + @param[in/out] Matrix_cache Thread local cache that maintains the accumulated P5 contribution (write) the gauge link going from a to b (read), as well as force_sig when sig is positive (read/write) @details This subset of the code computes the full seven link contribution to the HISQ force. Data traffic: @@ -705,8 +707,8 @@ namespace quda { Flops: 4 multiplies, 2 adds, 2 rescales */ - __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, - SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) { auto mycoeff_seven = parity_sign(parity_a) * coeff_sign(parity_a) * arg.coeff_seven; int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -735,19 +737,19 @@ namespace quda { UbeOeOf = Ube * OeOf; // Cache Ube to below - Matrix_cache.save_z(Ube, 1); + Matrix_cache[1] = Ube; } // Take care of force_sig --- contribution from the negative rho direction Link Uaf = arg.link(arg.rho, point_a, parity_a); if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * UbeOeOf, conj(Uaf), force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } // Compute the force_rho --- contribution from the negative rho direction - Link Uab = Matrix_cache.load_z(0); + Link Uab = Matrix_cache[0]; if constexpr (!sig_positive) Uab = conj(Uab); Link force_rho = arg.force(arg.rho, point_a, parity_a); force_rho = mm_add(mycoeff_seven * conj(UbeOeOf), conj(Uab), force_rho); @@ -756,7 +758,7 @@ namespace quda { Link Ufe = arg.link(arg.sig, fe_link_nbr_idx, fe_link_nbr_parity); // Load Ube from the cache - Link Ube = Matrix_cache.load_z(1); + Link Ube = Matrix_cache[1]; // Form the product UfeUebOb Link UfeUeb = (sig_positive ? Ufe : conj(Ufe)) * conj(Ube); @@ -788,7 +790,7 @@ namespace quda { Link Oz = Ucb * Ob; Link Oy = (sig_positive ? 
Udc : conj(Udc)) * Oz; p5_sig = mm_add(arg.accumu_coeff_seven * conj(Uda), Oy, p5_sig); - Matrix_cache.save_z(p5_sig, 1); + Matrix_cache[1] = p5_sig; // When sig is positive, compute the force_sig contribution from the // positive rho direction @@ -796,9 +798,9 @@ namespace quda { Link Od = arg.qNuMu(0, point_d, parity_d); Link Oc = arg.pNuMu(0, point_c, parity_c); Link Oz = conj(Ucb) * Oc; - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * Oz, Od * Uda, force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } } @@ -808,7 +810,7 @@ namespace quda { @param[in] x Local coordinate @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Shared memory cache that maintains the full P5 contribution + @param[in/out] Matrix_cache Thread local cache that maintains the full P5 contribution summed from the previous middle five and all seven (read), as well as force_sig when sig is positive (read/write) @details This subset of the code computes the side link five link contribution to the HISQ force. @@ -818,7 +820,8 @@ namespace quda { Flops: 2 multiplies, 2 adds, 2 rescales */ - __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) { int y[4] = {x[0], x[1], x[2], x[3]}; int point_h = updateCoordExtendedIndexShiftMILC(y, arg.nu, arg); int parity_h = 1 - parity_a; @@ -832,7 +835,7 @@ namespace quda { int qh_link_nbr_idx = mu_positive ? point_q : point_h; int qh_link_nbr_parity = mu_positive ? parity_q : parity_h; - Link P5 = Matrix_cache.load_z(1); + Link P5 = Matrix_cache[1]; Link Uah = arg.link(arg.nu, ha_link_nbr_idx, ha_link_nbr_parity); Link Ow = nu_positive ? Uah * P5 : conj(Uah) * P5; @@ -857,7 +860,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Helper shared memory cache that maintains the gauge link going + @param[in/out] Matrix_cache Thread local cache that maintains the gauge link going from a to b (read) and, when sig is positive, force_sig (read/write) @details This subset of the code computes the middle link five link contribution to the HISQ force. 
Data traffic: @@ -870,8 +873,8 @@ namespace quda { Flops: 1 multiply, 1 add, 1 rescale */ - __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, - SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -902,7 +905,7 @@ namespace quda { arg.pNuMu_next(0, point_b, parity_b) = Ow; { // scoped Uab load - Link Uab = Matrix_cache.load_z(0); + Link Uab = Matrix_cache[0]; if constexpr (!sig_positive) Uab = conj(Uab); arg.p5(0, point_a, parity_a) = Uab * Ow; } @@ -917,9 +920,9 @@ namespace quda { // compute the force in the sigma direction if sig is positive if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(arg.coeff_five * Ow, Ox, force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } } @@ -955,14 +958,14 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - + // calculate p5_sig - auto block_dim = target::block_dim(); - block_dim.z = (sig_positive ? 3 : 2); - SharedMemoryCache Matrix_cache(block_dim); + constexpr int cacheLen = sig_positive ? 3 : 2; + ThreadLocalCache> Matrix_cache{}; + if constexpr (sig_positive) { Link force_sig = arg.force(arg.sig, point_a, parity_a); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache[2] = force_sig; } // Scoped load of Uab @@ -972,7 +975,7 @@ namespace quda { int ab_link_nbr_idx = (sig_positive) ? point_a : point_b; int ab_link_nbr_parity = (sig_positive) ? parity_a : parity_b; Link Uab = arg.link(arg.sig, ab_link_nbr_idx, ab_link_nbr_parity); - Matrix_cache.save_z(Uab, 0); + Matrix_cache[0] = Uab; } // accumulate into P5, force_sig @@ -987,7 +990,7 @@ namespace quda { // update the force in the sigma direction if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; arg.force(arg.sig, point_a, parity_a) = force_sig; } diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h new file mode 100644 index 0000000000..d730c3b4e7 --- /dev/null +++ b/include/targets/cuda/thread_local_cache.h @@ -0,0 +1,132 @@ +#pragma once + +#include + +/** + @file thread_local_cache.h + + Thread local cache object which may use shared memory for optimization. + */ + +namespace quda +{ + + /** + @brief Class for threads to store a unique value which can use + shared memory for optimization purposes. 
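+
+     A minimal usage sketch (illustrative only: the Link type and the
+     computeStaple() helper are assumptions drawn from the gauge kernels
+     patched above, not part of this class):
+
+       ThreadLocalCache<Link> Stap{};
+       Stap = computeStaple(arg, x);  // store via the assignment operator
+       Link U = Stap;                 // load back via the conversion to T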
+ */ + template class ThreadLocalCache + { + public: + using value_type = T; + using offset_type = O; // type of object that may also use shared memory at the same which is created before this one + + private: + const unsigned int offset = 0; // dynamic offset in bytes + + /** + @brief This is a dummy instantiation for the host compiler + */ + template struct cache_dynamic { + T *operator()(unsigned) + { + static T *cache_; + return cache_; + } + }; + + /** + @brief This is the handle to the shared memory, dynamic specialization + @return Shared memory pointer + */ + template struct cache_dynamic { + __device__ inline T *operator()(unsigned int offset) + { + extern __shared__ int cache_[]; + return reinterpret_cast(reinterpret_cast(cache_) + offset); + } + }; + + __device__ __host__ inline T * cache() const + { + return target::dispatch(offset); + } + + __device__ __host__ inline void save_detail(const T &a) const + { + int j = target::thread_idx_linear<3>(); + cache()[j] = a; + } + + __device__ __host__ inline T &load_detail() const + { + int j = target::thread_idx_linear<3>(); + return cache()[j]; + } + + public: + static constexpr unsigned int get_offset() { + if constexpr(std::is_same_v) { + return 0; + } else { + return O::size(); + } + } + + static constexpr unsigned int size() { + return get_offset() + target::block_size<3>() * sizeof(T); + } + + /** + @brief Constructor for ThreadLocalCache. + */ + constexpr ThreadLocalCache() : offset(get_offset()) {} + + /** + @brief Grab the raw base address to this cache. + */ + __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } + + /** + @brief Save the value into the thread local cache. + @param[in] a The value to store in the thread local cache + */ + __device__ __host__ inline void save(const T &a) const + { + save_detail(a); + } + + /** + @brief Load a value from the thread local cache + @return The value at the linear thread index + */ + __device__ __host__ inline T &load() const + { + return load_detail(); + } + + /** + @brief Cast operator to allow cache objects to be used where T + is expected + */ + __device__ __host__ operator T() const { return load(); } + //__device__ __host__ operator T() const { T a; return a; } + + /** + @brief Assignment operator to allow cache objects to be used on + the lhs where T is otherwise expected. + */ + __device__ __host__ void operator=(const T &src) const { save(src); } + //__device__ __host__ void operator=(const T &src) const { ; } + + /** + @brief Subscripting operator returning reference to allow cache objects + to assign to a subscripted element. + */ + __device__ __host__ auto& operator[](int i) { return load()[i]; } + }; + +} // namespace quda + +// include overloads +#include "../generic/thread_local_cache.h" diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h new file mode 100644 index 0000000000..f4af1b0a21 --- /dev/null +++ b/include/targets/generic/thread_local_cache.h @@ -0,0 +1,68 @@ +#include + +/** + @file thread_local_cache_helper.h + @brief Convenience overloads to allow ThreadLocalCache objects to + appear in simple expressions. The actual implementation of + ThreadLocalCache is target specific, and located in e.g., + include/targets/cuda/thread_local_cache.h, etc. 
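+
+     For example (an illustrative sketch; Stap and delta are assumed
+     names, with Stap a ThreadLocalCache holding a Link matrix):
+
+       Stap += delta;           // read-modify-write via operator+=
+       Link Z = Stap + delta;   // operator+ loads the cached value
+       Link C = conj(Stap);     // conj() applied to the cached value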
+ */ + +namespace quda +{ + + template + __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + { + return static_cast(a) + b; + } + + template + __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + { + return a + static_cast(b); + } + + template + __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + { + return static_cast(a) - b; + } + + template + __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + { + return a - static_cast(b); + } + + template + __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + { + a.save(static_cast(a) + b); + return a; + } + + template + __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + { + a.save(static_cast(a) - b); + return a; + } + + template + __device__ __host__ inline auto conj(const ThreadLocalCache &a) + { + return conj(static_cast(a)); + } + + /** + @brief Uniform helper for exposing type T, whether we are dealing + with an instance of T or ThreadLocalCache + */ + template + struct get_type< + T, std::enable_if_t>>> { + using type = typename T::value_type; + }; + +} // namespace quda From 8f297f1c7db49671f6d2e2eb6eb006cfaf42f2cc Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 18 May 2023 13:14:37 -0500 Subject: [PATCH 02/27] fix CI error --- include/targets/cuda/thread_local_cache.h | 37 ++++++++------------ include/targets/generic/thread_local_cache.h | 24 +++++-------- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index d730c3b4e7..7b3a51c547 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -47,10 +47,7 @@ namespace quda } }; - __device__ __host__ inline T * cache() const - { - return target::dispatch(offset); - } + __device__ __host__ inline T *cache() const { return target::dispatch(offset); } __device__ __host__ inline void save_detail(const T &a) const { @@ -64,23 +61,23 @@ namespace quda return cache()[j]; } - public: - static constexpr unsigned int get_offset() { - if constexpr(std::is_same_v) { - return 0; - } else { - return O::size(); - } + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; } - static constexpr unsigned int size() { - return get_offset() + target::block_size<3>() * sizeof(T); + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + block.x * block.y * block.z * sizeof(T); } /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : offset(get_offset()) {} + constexpr ThreadLocalCache() : offset(get_offset(target::block_dim())) { } /** @brief Grab the raw base address to this cache. @@ -91,19 +88,13 @@ namespace quda @brief Save the value into the thread local cache. 
@param[in] a The value to store in the thread local cache */ - __device__ __host__ inline void save(const T &a) const - { - save_detail(a); - } + __device__ __host__ inline void save(const T &a) const { save_detail(a); } /** @brief Load a value from the thread local cache @return The value at the linear thread index */ - __device__ __host__ inline T &load() const - { - return load_detail(); - } + __device__ __host__ inline T &load() const { return load_detail(); } /** @brief Cast operator to allow cache objects to be used where T @@ -123,7 +114,7 @@ namespace quda @brief Subscripting operator returning reference to allow cache objects to assign to a subscripted element. */ - __device__ __host__ auto& operator[](int i) { return load()[i]; } + __device__ __host__ auto &operator[](int i) { return load()[i]; } }; } // namespace quda diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index f4af1b0a21..bc9cbc2e7e 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -11,46 +11,39 @@ namespace quda { - template - __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; } - template - __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) { return a + static_cast(b); } - template - __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) { return static_cast(a) - b; } - template - __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) { return a - static_cast(b); } - template - __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) + b); return a; } - template - __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) - b); return a; } - template - __device__ __host__ inline auto conj(const ThreadLocalCache &a) + template __device__ __host__ inline auto conj(const ThreadLocalCache &a) { return conj(static_cast(a)); } @@ -60,8 +53,7 @@ namespace quda with an instance of T or ThreadLocalCache */ template - struct get_type< - T, std::enable_if_t>>> { + struct get_type>>> { using type = typename T::value_type; }; From 1a3614c9c484a375b2abd387aae6fd483dd62f37 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Mon, 22 May 2023 16:21:22 -0500 Subject: [PATCH 03/27] add coalescing to ThreadLocalCache --- include/targets/cuda/thread_local_cache.h | 31 +++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index 7b3a51c547..a0bb4a2340 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -22,15 +22,22 @@ namespace quda using offset_type = O; // type of object that may also use shared memory at the same which is created before this one private: + using atom_t = std::conditional_t>; + static_assert(sizeof(T) % 4 
== 0, "Shared memory cache does not support sub-word size types"); + + // The number of elements of type atom_t that we break T into for optimal shared-memory access + static constexpr int n_element = sizeof(T) / sizeof(atom_t); + + const int stride; const unsigned int offset = 0; // dynamic offset in bytes /** @brief This is a dummy instantiation for the host compiler */ template struct cache_dynamic { - T *operator()(unsigned) + atom_t *operator()(unsigned) { - static T *cache_; + static atom_t *cache_; return cache_; } }; @@ -40,25 +47,33 @@ namespace quda @return Shared memory pointer */ template struct cache_dynamic { - __device__ inline T *operator()(unsigned int offset) + __device__ inline atom_t *operator()(unsigned int offset) { extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); + return reinterpret_cast(reinterpret_cast(cache_) + offset); } }; - __device__ __host__ inline T *cache() const { return target::dispatch(offset); } + __device__ __host__ inline atom_t *cache() const { return target::dispatch(offset); } __device__ __host__ inline void save_detail(const T &a) const { + atom_t tmp[n_element]; + memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); - cache()[j] = a; +#pragma unroll + for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; } __device__ __host__ inline T &load_detail() const { + atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); - return cache()[j]; +#pragma unroll + for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; + T a; + memcpy((void *)&a, tmp, sizeof(T)); + return a; } static constexpr unsigned int get_offset(dim3 block) @@ -77,7 +92,7 @@ namespace quda /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : offset(get_offset(target::block_dim())) { } + constexpr ThreadLocalCache() : stride(target::block_size<3>()), offset(get_offset(target::block_dim())) { } /** @brief Grab the raw base address to this cache. From c2235ee18d3beb4104217435e94a7354883af603 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 23 May 2023 16:15:26 -0500 Subject: [PATCH 04/27] don't return reference for ThreadLocalCache indexing --- include/kernels/gauge_stout.cuh | 2 +- include/kernels/gauge_wilson_flow.cuh | 2 +- include/kernels/hisq_paths_force.cuh | 20 ++++---- include/targets/cuda/thread_local_cache.h | 50 +++++++++++++++----- include/targets/generic/thread_local_cache.h | 16 +++---- 5 files changed, 57 insertions(+), 33 deletions(-) diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index ca22d0e9ac..56bd00f425 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -136,7 +136,7 @@ namespace quda Link U, Q; ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index 1799d3c0dd..050295f271 100644 --- a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -73,7 +73,7 @@ namespace quda // and the 1x2 and 2x1 rectangles of length 5. 
From the following paper: // https://arxiv.org/abs/0801.1165 ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); break; diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index 522442da3b..bdba9d6bbe 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -415,7 +415,7 @@ namespace quda { Link Ow = mu_positive ? (conj(Ucb) * Oc) : (Ucb * Oc); { - Link Uab = Uab_cache.load(); + Link Uab = Uab_cache; Link Oy = sig_positive ? Uab * Ow : conj(Uab) * Ow; Link Ox = mu_positive ? (Oy * Uid) : (Uid * conj(Oy)); auto mycoeff_lepage = -coeff_sign(parity_a)*coeff_sign(parity_a)*arg.coeff_lepage; @@ -489,7 +489,7 @@ namespace quda { arg.pMu_next(0, point_b, parity_b) = Oz; { // scoped Uab load - Link Uab = Uab_cache.load(); + Link Uab = Uab_cache; if constexpr (!sig_positive) Uab = conj(Uab); arg.p3(0, point_a, parity_a) = Uab * Oz; } @@ -737,7 +737,7 @@ namespace quda { UbeOeOf = Ube * OeOf; // Cache Ube to below - Matrix_cache[1] = Ube; + Matrix_cache.save(Ube, 1); } // Take care of force_sig --- contribution from the negative rho direction @@ -745,7 +745,7 @@ namespace quda { if constexpr (sig_positive) { Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * UbeOeOf, conj(Uaf), force_sig); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } // Compute the force_rho --- contribution from the negative rho direction @@ -790,7 +790,7 @@ namespace quda { Link Oz = Ucb * Ob; Link Oy = (sig_positive ? Udc : conj(Udc)) * Oz; p5_sig = mm_add(arg.accumu_coeff_seven * conj(Uda), Oy, p5_sig); - Matrix_cache[1] = p5_sig; + Matrix_cache.save(p5_sig, 1); // When sig is positive, compute the force_sig contribution from the // positive rho direction @@ -800,7 +800,7 @@ namespace quda { Link Oz = conj(Ucb) * Oc; Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * Oz, Od * Uda, force_sig); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } } @@ -922,7 +922,7 @@ namespace quda { if constexpr (sig_positive) { Link force_sig = Matrix_cache[2]; force_sig = mm_add(arg.coeff_five * Ow, Ox, force_sig); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } } @@ -961,11 +961,11 @@ namespace quda { // calculate p5_sig constexpr int cacheLen = sig_positive ? 3 : 2; - ThreadLocalCache> Matrix_cache{}; + ThreadLocalCache Matrix_cache{}; if constexpr (sig_positive) { Link force_sig = arg.force(arg.sig, point_a, parity_a); - Matrix_cache[2] = force_sig; + Matrix_cache.save(force_sig, 2); } // Scoped load of Uab @@ -975,7 +975,7 @@ namespace quda { int ab_link_nbr_idx = (sig_positive) ? point_a : point_b; int ab_link_nbr_parity = (sig_positive) ? parity_a : parity_b; Link Uab = arg.link(arg.sig, ab_link_nbr_idx, ab_link_nbr_parity); - Matrix_cache[0] = Uab; + Matrix_cache.save(Uab, 0); } // accumulate into P5, force_sig diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index a0bb4a2340..45eeca41da 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -15,11 +15,13 @@ namespace quda @brief Class for threads to store a unique value which can use shared memory for optimization purposes. 
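+     Indexed-access sketch (illustrative): loads copy data out of shared
+     memory, so operator[] returns by value rather than by reference, and
+     writes at an index go through save():
+
+       ThreadLocalCache<Link, 3> cache{};
+       cache.save(force_sig, 2);  // write element 2
+       Link f = cache[2];         // read element 2 (a copy, not a reference)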
*/ - template class ThreadLocalCache + template class ThreadLocalCache { public: using value_type = T; using offset_type = O; // type of object that may also use shared memory at the same which is created before this one + static constexpr int N = N_; + static constexpr int len = std::max(1,N); private: using atom_t = std::conditional_t>; @@ -56,21 +58,21 @@ namespace quda __device__ __host__ inline atom_t *cache() const { return target::dispatch(offset); } - __device__ __host__ inline void save_detail(const T &a) const + __device__ __host__ inline void save_detail(const T &a, const int k) const { atom_t tmp[n_element]; memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) cache()[(k*n_element + i) * stride + j] = tmp[i]; } - __device__ __host__ inline T &load_detail() const + __device__ __host__ inline T load_detail(const int k) const { atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = cache()[(k*n_element + i) * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -86,7 +88,7 @@ namespace quda public: static constexpr unsigned int shared_mem_size(dim3 block) { - return get_offset(block) + block.x * block.y * block.z * sizeof(T); + return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); } /** @@ -103,33 +105,55 @@ namespace quda @brief Save the value into the thread local cache. @param[in] a The value to store in the thread local cache */ - __device__ __host__ inline void save(const T &a) const { save_detail(a); } + __device__ __host__ inline void save(const T &a) const { + static_assert(N == 0); + save_detail(a, 0); + } + + /** + @brief Save the value into the thread local cache. + @param[in] a The value to store in the thread local cache + */ + __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } + + /** + @brief Load a value from the thread local cache + @return The value at the linear thread index + */ + __device__ __host__ inline T load() const { + static_assert(N == 0); + return load_detail(0); + } /** @brief Load a value from the thread local cache @return The value at the linear thread index */ - __device__ __host__ inline T &load() const { return load_detail(); } + __device__ __host__ inline T load(const int k) const { return load_detail(k); } /** @brief Cast operator to allow cache objects to be used where T is expected */ - __device__ __host__ operator T() const { return load(); } - //__device__ __host__ operator T() const { T a; return a; } + __device__ __host__ operator T() const { + static_assert(N == 0); + return load(0); + } /** @brief Assignment operator to allow cache objects to be used on the lhs where T is otherwise expected. */ - __device__ __host__ void operator=(const T &src) const { save(src); } - //__device__ __host__ void operator=(const T &src) const { ; } + __device__ __host__ void operator=(const T &src) const { + static_assert(N == 0); + save(src, 0); + } /** @brief Subscripting operator returning reference to allow cache objects to assign to a subscripted element. 
*/ - __device__ __host__ auto &operator[](int i) { return load()[i]; } + __device__ __host__ auto operator[](int i) { return load(i); } }; } // namespace quda diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index bc9cbc2e7e..f2aa515801 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -11,39 +11,39 @@ namespace quda { - template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; } - template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) { return a + static_cast(b); } - template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) { return static_cast(a) - b; } - template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) { return a - static_cast(b); } - template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) + b); return a; } - template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) - b); return a; } - template __device__ __host__ inline auto conj(const ThreadLocalCache &a) + template __device__ __host__ inline auto conj(const ThreadLocalCache &a) { return conj(static_cast(a)); } @@ -53,7 +53,7 @@ namespace quda with an instance of T or ThreadLocalCache */ template - struct get_type>>> { + struct get_type>>> { using type = typename T::value_type; }; From f686285341056eada006a6e55246fad137308554 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 24 May 2023 13:02:54 -0500 Subject: [PATCH 05/27] update docs --- include/targets/cuda/thread_local_cache.h | 41 ++++++++++++----------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index 45eeca41da..286f019a28 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -6,26 +6,27 @@ @file thread_local_cache.h Thread local cache object which may use shared memory for optimization. + The storage can be a single object or an array of objects. */ namespace quda { /** - @brief Class for threads to store a unique value which can use + @brief Class for threads to store a unique value, or array of values, which can use shared memory for optimization purposes. 
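+     The N parameter selects the behavior (illustrative sketch):
+
+       ThreadLocalCache<Link> single{};   // N == 0: acts like a single Link
+       single = U;                        // assignment and conversion apply
+       ThreadLocalCache<Link, 2> pair{};  // N > 0: acts like Link[2]
+       pair.save(U, 0);                   // indexed save/load only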
*/ template class ThreadLocalCache { public: using value_type = T; - using offset_type = O; // type of object that may also use shared memory at the same which is created before this one - static constexpr int N = N_; - static constexpr int len = std::max(1,N); + using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one + static constexpr int N = N_; // size of array, 0 means to behave like T instead of array + static constexpr int len = std::max(1,N); // actual number of elements to store private: using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); + static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); // The number of elements of type atom_t that we break T into for optimal shared-memory access static constexpr int n_element = sizeof(T) / sizeof(atom_t); @@ -102,7 +103,7 @@ namespace quda __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } /** - @brief Save the value into the thread local cache. + @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. @param[in] a The value to store in the thread local cache */ __device__ __host__ inline void save(const T &a) const { @@ -111,14 +112,15 @@ namespace quda } /** - @brief Save the value into the thread local cache. + @brief Save the value into an element of the thread local cache. @param[in] a The value to store in the thread local cache + @param[in] k The index to use */ __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } /** - @brief Load a value from the thread local cache - @return The value at the linear thread index + @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. + @return The value stored in the thread local cache */ __device__ __host__ inline T load() const { static_assert(N == 0); @@ -126,34 +128,35 @@ namespace quda } /** - @brief Load a value from the thread local cache - @return The value at the linear thread index + @brief Load a value from an element of the thread local cache + @param[in] k The index to use + @return The value stored in the thread local cache at that index */ __device__ __host__ inline T load(const int k) const { return load_detail(k); } /** - @brief Cast operator to allow cache objects to be used where T - is expected + @brief Cast operator to allow cache objects to be used where T is expected (when N==0). */ __device__ __host__ operator T() const { static_assert(N == 0); - return load(0); + return load(); } /** @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected. + the lhs where T is otherwise expected (when N==0). */ __device__ __host__ void operator=(const T &src) const { static_assert(N == 0); - save(src, 0); + save(src); } /** - @brief Subscripting operator returning reference to allow cache objects - to assign to a subscripted element. + @brief Subscripting operator returning value at index for convenience. 
+ @param[in] i The index to use + @return The value stored in the thread local cache at that index */ - __device__ __host__ auto operator[](int i) { return load(i); } + __device__ __host__ T operator[](int i) { return load(i); } }; } // namespace quda From b16b9b2bc7e86cf03fae82e6d4793d2b0692efc9 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 11 Jul 2023 13:09:16 -0500 Subject: [PATCH 06/27] add shared memory helper --- include/targets/cuda/shared_memory_helper.h | 85 ++++++++++ include/targets/cuda/thread_local_cache.h | 164 ------------------- include/targets/generic/thread_local_cache.h | 139 +++++++++++++++- 3 files changed, 219 insertions(+), 169 deletions(-) create mode 100644 include/targets/cuda/shared_memory_helper.h diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h new file mode 100644 index 0000000000..bd8d919359 --- /dev/null +++ b/include/targets/cuda/shared_memory_helper.h @@ -0,0 +1,85 @@ +#pragma once + +#include + +/** + @file shared_memory_helper.h + + Target specific helper for allocating and accessing shared memory. + */ + +namespace quda +{ + + /** + @brief Class which is used to allocate and access shared memory. + The shared memory is treated as an array of type T, with the + number of elements given by the static member S::size(). The + offset from the beginning of the total shared memory block is + given by the static member O::shared_mem_size(block), or 0 if O + is void. + */ + template class SharedMemory + { + public: + using value_type = T; + + private: + T *data; + const unsigned int size; // number of elements of type T + + /** + @brief This is a dummy instantiation for the host compiler + */ + template struct cache_dynamic { + T *operator()(unsigned int) + { + static T *cache_; + return cache_; + } + }; + + /** + @brief This is the handle to the dynamic shared memory + @return Shared memory pointer + */ + template struct cache_dynamic { + __device__ inline T *operator()(unsigned int offset) + { + extern __shared__ int cache_[]; + return reinterpret_cast(reinterpret_cast(cache_) + offset); + } + }; + + __device__ __host__ inline T *cache(unsigned int offset) const + { + return target::dispatch(offset); + } + + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; + } + + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + S::size()*sizeof(T); + } + + /** + @brief Constructor for SharedMemory object. + */ + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + + /** + @brief Subscripting operator returning a reference to element. + @param[in] i The index to use. + @return Reference to value stored at that index. + */ + __device__ __host__ T &operator[](const int i) const { return data[i]; } + }; + +} // namespace quda diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h index 286f019a28..dd4cd863fc 100644 --- a/include/targets/cuda/thread_local_cache.h +++ b/include/targets/cuda/thread_local_cache.h @@ -1,165 +1 @@ -#pragma once - -#include - -/** - @file thread_local_cache.h - - Thread local cache object which may use shared memory for optimization. - The storage can be a single object or an array of objects. 
- */ - -namespace quda -{ - - /** - @brief Class for threads to store a unique value, or array of values, which can use - shared memory for optimization purposes. - */ - template class ThreadLocalCache - { - public: - using value_type = T; - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one - static constexpr int N = N_; // size of array, 0 means to behave like T instead of array - static constexpr int len = std::max(1,N); // actual number of elements to store - - private: - using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); - - // The number of elements of type atom_t that we break T into for optimal shared-memory access - static constexpr int n_element = sizeof(T) / sizeof(atom_t); - - const int stride; - const unsigned int offset = 0; // dynamic offset in bytes - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_dynamic { - atom_t *operator()(unsigned) - { - static atom_t *cache_; - return cache_; - } - }; - - /** - @brief This is the handle to the shared memory, dynamic specialization - @return Shared memory pointer - */ - template struct cache_dynamic { - __device__ inline atom_t *operator()(unsigned int offset) - { - extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); - } - }; - - __device__ __host__ inline atom_t *cache() const { return target::dispatch(offset); } - - __device__ __host__ inline void save_detail(const T &a, const int k) const - { - atom_t tmp[n_element]; - memcpy(tmp, (void *)&a, sizeof(T)); - int j = target::thread_idx_linear<3>(); -#pragma unroll - for (int i = 0; i < n_element; i++) cache()[(k*n_element + i) * stride + j] = tmp[i]; - } - - __device__ __host__ inline T load_detail(const int k) const - { - atom_t tmp[n_element]; - int j = target::thread_idx_linear<3>(); -#pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[(k*n_element + i) * stride + j]; - T a; - memcpy((void *)&a, tmp, sizeof(T)); - return a; - } - - static constexpr unsigned int get_offset(dim3 block) - { - unsigned int o = 0; - if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } - return o; - } - - public: - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); - } - - /** - @brief Constructor for ThreadLocalCache. - */ - constexpr ThreadLocalCache() : stride(target::block_size<3>()), offset(get_offset(target::block_dim())) { } - - /** - @brief Grab the raw base address to this cache. - */ - __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } - - /** - @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. - @param[in] a The value to store in the thread local cache - */ - __device__ __host__ inline void save(const T &a) const { - static_assert(N == 0); - save_detail(a, 0); - } - - /** - @brief Save the value into an element of the thread local cache. - @param[in] a The value to store in the thread local cache - @param[in] k The index to use - */ - __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } - - /** - @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. 
- @return The value stored in the thread local cache - */ - __device__ __host__ inline T load() const { - static_assert(N == 0); - return load_detail(0); - } - - /** - @brief Load a value from an element of the thread local cache - @param[in] k The index to use - @return The value stored in the thread local cache at that index - */ - __device__ __host__ inline T load(const int k) const { return load_detail(k); } - - /** - @brief Cast operator to allow cache objects to be used where T is expected (when N==0). - */ - __device__ __host__ operator T() const { - static_assert(N == 0); - return load(); - } - - /** - @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected (when N==0). - */ - __device__ __host__ void operator=(const T &src) const { - static_assert(N == 0); - save(src); - } - - /** - @brief Subscripting operator returning value at index for convenience. - @param[in] i The index to use - @return The value stored in the thread local cache at that index - */ - __device__ __host__ T operator[](int i) { return load(i); } - }; - -} // namespace quda - -// include overloads #include "../generic/thread_local_cache.h" diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index f2aa515801..8df61aebc4 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -1,16 +1,145 @@ +#pragma once + +#include +#include #include /** - @file thread_local_cache_helper.h - @brief Convenience overloads to allow ThreadLocalCache objects to - appear in simple expressions. The actual implementation of - ThreadLocalCache is target specific, and located in e.g., - include/targets/cuda/thread_local_cache.h, etc. + @file thread_local_cache.h + + Thread local cache object which may use shared memory for optimization. + The storage can be a single object or an array of objects. */ namespace quda { + template + using atom_t = std::conditional_t>; + + template struct SizeStatic { + static constexpr unsigned int size() { return N; } + }; + + /** + @brief Class for threads to store a unique value, or array of values, which can use + shared memory for optimization purposes. 
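+
+     The offset type O chains allocations so that two caches can share the
+     dynamic shared memory block without overlapping, as in the gauge
+     kernels above (using decltype(Stap) as the offset type here is an
+     illustrative sketch):
+
+       ThreadLocalCache<Link> Stap{};
+       ThreadLocalCache<Link, 0, decltype(Stap)> Rect{};  // placed after Stap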
+ */ + template class ThreadLocalCache : SharedMemory, SizeStatic, O> + { + public: + using value_type = T; + using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one + static constexpr int N = N_; // size of array, 0 means to behave like T instead of array + static constexpr int len = std::max(1,N); // actual number of elements to store + using Smem = SharedMemory, SizeStatic, O>; + + private: + using atom_t = atom_t; + static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); + + // The number of elements of type atom_t that we break T into for optimal shared-memory access + static constexpr int n_element = sizeof(T) / sizeof(atom_t); + + const int stride; + + constexpr Smem smem() const { return *dynamic_cast(this); } + + __device__ __host__ inline void save_detail(const T &a, const int k) const + { + atom_t tmp[n_element]; + memcpy(tmp, (void *)&a, sizeof(T)); + int j = target::thread_idx_linear<3>(); +#pragma unroll + for (int i = 0; i < n_element; i++) smem()[(k*n_element + i) * stride + j] = tmp[i]; + } + + __device__ __host__ inline T load_detail(const int k) const + { + atom_t tmp[n_element]; + int j = target::thread_idx_linear<3>(); +#pragma unroll + for (int i = 0; i < n_element; i++) tmp[i] = smem()[(k*n_element + i) * stride + j]; + T a; + memcpy((void *)&a, tmp, sizeof(T)); + return a; + } + + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; + } + + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); + } + + /** + @brief Constructor for ThreadLocalCache. + */ + constexpr ThreadLocalCache() : stride(target::block_size<3>()) {} + + /** + @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. + @param[in] a The value to store in the thread local cache + */ + __device__ __host__ inline void save(const T &a) const { + static_assert(N == 0); + save_detail(a, 0); + } + + /** + @brief Save the value into an element of the thread local cache. + @param[in] a The value to store in the thread local cache + @param[in] k The index to use + */ + __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } + + /** + @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. + @return The value stored in the thread local cache + */ + __device__ __host__ inline T load() const { + static_assert(N == 0); + return load_detail(0); + } + + /** + @brief Load a value from an element of the thread local cache + @param[in] k The index to use + @return The value stored in the thread local cache at that index + */ + __device__ __host__ inline T load(const int k) const { return load_detail(k); } + + /** + @brief Cast operator to allow cache objects to be used where T is expected (when N==0). + */ + __device__ __host__ operator T() const { + static_assert(N == 0); + return load(); + } + + /** + @brief Assignment operator to allow cache objects to be used on + the lhs where T is otherwise expected (when N==0). + */ + __device__ __host__ void operator=(const T &src) const { + static_assert(N == 0); + save(src); + } + + /** + @brief Subscripting operator returning value at index for convenience. 
+ @param[in] i The index to use + @return The value stored in the thread local cache at that index + */ + __device__ __host__ T operator[](int i) { return load(i); } + }; + template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; From d6ac933e38f1092a98e1c32f12f0c30b1c06fb28 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 11 Jul 2023 13:32:41 -0500 Subject: [PATCH 07/27] add shared memory helper to HIP --- include/targets/hip/shared_memory_helper.h | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 include/targets/hip/shared_memory_helper.h diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h new file mode 100644 index 0000000000..bd8d919359 --- /dev/null +++ b/include/targets/hip/shared_memory_helper.h @@ -0,0 +1,85 @@ +#pragma once + +#include + +/** + @file shared_memory_helper.h + + Target specific helper for allocating and accessing shared memory. + */ + +namespace quda +{ + + /** + @brief Class which is used to allocate and access shared memory. + The shared memory is treated as an array of type T, with the + number of elements given by the static member S::size(). The + offset from the beginning of the total shared memory block is + given by the static member O::shared_mem_size(block), or 0 if O + is void. + */ + template class SharedMemory + { + public: + using value_type = T; + + private: + T *data; + const unsigned int size; // number of elements of type T + + /** + @brief This is a dummy instantiation for the host compiler + */ + template struct cache_dynamic { + T *operator()(unsigned int) + { + static T *cache_; + return cache_; + } + }; + + /** + @brief This is the handle to the dynamic shared memory + @return Shared memory pointer + */ + template struct cache_dynamic { + __device__ inline T *operator()(unsigned int offset) + { + extern __shared__ int cache_[]; + return reinterpret_cast(reinterpret_cast(cache_) + offset); + } + }; + + __device__ __host__ inline T *cache(unsigned int offset) const + { + return target::dispatch(offset); + } + + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; + } + + public: + static constexpr unsigned int shared_mem_size(dim3 block) + { + return get_offset(block) + S::size()*sizeof(T); + } + + /** + @brief Constructor for SharedMemory object. + */ + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + + /** + @brief Subscripting operator returning a reference to element. + @param[in] i The index to use. + @return Reference to value stored at that index. 
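+
+     Example of element access (an illustrative sketch; Size64 is a
+     hypothetical provider of the static S::size() contract, and the
+     kernel must be launched with shared_mem_size(block) bytes of dynamic
+     shared memory):
+
+       struct Size64 { static constexpr unsigned int size() { return 64; } };
+       SharedMemory<float, Size64> smem;
+       smem[target::thread_idx_linear<3>() % 64] = 0.0f;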
+ */ + __device__ __host__ T &operator[](const int i) const { return data[i]; } + }; + +} // namespace quda From f29ff0049123b12f9ac74ba7acbfc9e5133c0723 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 8 Aug 2023 18:21:51 -0500 Subject: [PATCH 08/27] refactor SharedMemoryCache to use new SharedMemory object --- include/kernels/color_spinor_pack.cuh | 16 +- include/kernels/dslash_clover_helper.cuh | 2 +- include/kernels/dslash_coarse.cuh | 2 +- include/kernels/dslash_domain_wall_m5.cuh | 10 +- include/kernels/dslash_mobius_eofa.cuh | 4 +- .../kernels/dslash_ndeg_twisted_clover.cuh | 2 +- ...ash_ndeg_twisted_clover_preconditioned.cuh | 2 +- ...slash_ndeg_twisted_mass_preconditioned.cuh | 2 +- include/kernels/hisq_paths_force.cuh | 4 +- .../targets/cuda/shared_memory_cache_helper.h | 294 ------------------ include/targets/cuda/shared_memory_helper.h | 15 +- include/targets/cuda/thread_array.h | 52 +--- include/targets/generic/helpers.h | 42 +++ .../generic/shared_memory_cache_helper.h | 269 ++++++++++++++-- include/targets/generic/thread_array.h | 41 +++ include/targets/generic/thread_local_cache.h | 32 +- 16 files changed, 386 insertions(+), 403 deletions(-) create mode 100644 include/targets/generic/helpers.h create mode 100644 include/targets/generic/thread_array.h diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index 58ae6165d3..3d6ae5b6b2 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ -166,6 +166,13 @@ namespace quda { } }; + struct DimsPadX { + static constexpr dim3 dims(dim3 block) { + block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); + return block; + } + }; + template <> struct site_max { template __device__ inline auto operator()(typename Arg::real thread_max, Arg &) { @@ -173,11 +180,12 @@ namespace quda { constexpr int Ms = spins_per_thread(); constexpr int Mc = colors_per_thread(); constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); - auto block = target::block_dim(); + //auto block = target::block_dim(); // pad the shared block size to avoid bank conflicts - block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); - block.y = color_spin_threads; // state the y block since we know it at compile time - SharedMemoryCache cache(block); + //block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); + //block.y = color_spin_threads; // state the y block since we know it at compile time + //SharedMemoryCache cache(block); + SharedMemoryCache cache; cache.save(thread_max); cache.sync(); real this_site_max = static_cast(0); diff --git a/include/kernels/dslash_clover_helper.cuh b/include/kernels/dslash_clover_helper.cuh index 00b61a9b3e..4cf6f1a311 100644 --- a/include/kernels/dslash_clover_helper.cuh +++ b/include/kernels/dslash_clover_helper.cuh @@ -203,7 +203,7 @@ namespace quda { Mat A = arg.clover(x_cb, clover_parity, chirality); - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; half_fermion in_chi[n_flavor]; // flavor array of chirally projected fermion #pragma unroll diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh index 9f90de7287..cd214d8fbd 100644 --- a/include/kernels/dslash_coarse.cuh +++ b/include/kernels/dslash_coarse.cuh @@ -299,7 +299,7 @@ namespace quda { template <> struct dim_collapse { template __device__ __host__ inline void operator()(T &out, int dir, int dim, const Arg &arg) { - 
SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; // only need to write to shared memory if not master thread if (dim > 0 || dir) cache.save(out); diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index bab21d4c11..dfce18bede 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -220,7 +220,7 @@ namespace quda if (mobius_m5::use_half_vector()) { // if using shared-memory caching then load spinor field for my site into cache typedef ColorSpinor HalfVector; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; { // forwards direction constexpr int proj_dir = dagger ? +1 : -1; @@ -271,7 +271,7 @@ namespace quda } else { // use_half_vector // if using shared-memory caching then load spinor field for my site into cache - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { if (sync) { cache.sync(); } cache.save(in); @@ -377,7 +377,7 @@ namespace quda const auto inv = arg.inv; // if using shared-memory caching then load spinor field for my site into cache - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { // cache.save(arg.in(s_ * arg.volume_4d_cb + x_cb, parity)); if (sync) { cache.sync(); } @@ -436,7 +436,7 @@ namespace quda Vector out; if (mobius_m5::use_half_vector()) { - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; { // first do R constexpr int proj_dir = dagger ? -1 : +1; @@ -495,7 +495,7 @@ namespace quda out += l.reconstruct(4, proj_dir); } } else { // use_half_vector - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { if (sync) { cache.sync(); } cache.save(in); diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh index f5e0a5c8ac..3d62ec3923 100644 --- a/include/kernels/dslash_mobius_eofa.cuh +++ b/include/kernels/dslash_mobius_eofa.cuh @@ -107,7 +107,7 @@ namespace quda using real = typename Arg::real; typedef ColorSpinor Vector; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; Vector out; cache.save(arg.in(s * arg.volume_4d_cb + x_cb, parity)); @@ -185,7 +185,7 @@ namespace quda typedef ColorSpinor Vector; const auto sherman_morrison = arg.sherman_morrison; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; cache.save(arg.in(s * arg.volume_4d_cb + x_cb, parity)); cache.sync(); diff --git a/include/kernels/dslash_ndeg_twisted_clover.cuh b/include/kernels/dslash_ndeg_twisted_clover.cuh index 108f8c5e84..cb8bc61ad7 100644 --- a/include/kernels/dslash_ndeg_twisted_clover.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover.cuh @@ -72,7 +72,7 @@ namespace quda // apply the chiral and flavor twists // use consistent load order across s to ensure better cache locality Vector x = arg.x(my_flavor_idx, my_spinor_parity); - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; cache.save(x); x.toRel(); // switch to chiral basis diff --git a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh index bdbff30817..ebd8f71da6 100644 --- a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh @@ -91,7 +91,7 @@ namespace quda int chirality = flavor; // relabel flavor as chirality - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; enum 
swizzle_direction { FORWARDS = 0, diff --git a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh index 98e72eb61a..8bab3d5623 100644 --- a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh @@ -95,7 +95,7 @@ namespace quda } if (!dagger || Arg::asymmetric) { // apply A^{-1} to D*in - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (isComplete(arg, coord) && active) { // to apply the preconditioner we need to put "out" in shared memory so the other flavor can access it cache.save(out); diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index bdba9d6bbe..84173a9a5a 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -559,7 +559,7 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - ThreadLocalCache Uab_cache{}; + ThreadLocalCache Uab_cache; // Scoped load of Uab { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -961,7 +961,7 @@ namespace quda { // calculate p5_sig constexpr int cacheLen = sig_positive ? 3 : 2; - ThreadLocalCache Matrix_cache{}; + ThreadLocalCache Matrix_cache; if constexpr (sig_positive) { Link force_sig = arg.force(arg.sig, point_a, parity_a); diff --git a/include/targets/cuda/shared_memory_cache_helper.h b/include/targets/cuda/shared_memory_cache_helper.h index 7c7c0a1b28..73be0cd01b 100644 --- a/include/targets/cuda/shared_memory_cache_helper.h +++ b/include/targets/cuda/shared_memory_cache_helper.h @@ -1,295 +1 @@ -#pragma once - -#include -#include - -/** - @file shared_memory_cache_helper.h - - Helper functionality for aiding the use of the shared memory for - sharing data between threads in a thread block. - */ - -namespace quda -{ - - /** - @brief Class which wraps around a shared memory cache for type T, - where each thread in the thread block stores a unique value in - the cache which any other thread can access. - - This accessor supports both explicit run-time block size and - compile-time sizing. - - * For run-time block size, the constructor should be initialied - with the desired block size. - - * For compile-time block size, no arguments should be passed to - the constructor, and then the second and third template - parameters correspond to the y and z dimensions of the block, - respectively. The x dimension of the block will be set - according the maximum number of threads possible, given these - dimensions. 
- */ - template class SharedMemoryCache - { - public: - using value_type = T; - static constexpr int block_size_y = block_size_y_; - static constexpr int block_size_z = block_size_z_; - static constexpr bool dynamic = dynamic_; - - private: - /** maximum number of threads in x given the y and z block sizes */ - static constexpr int block_size_x = device::max_block_size(); - - using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); - - // The number of elements of type atom_t that we break T into for optimal shared-memory access - static constexpr int n_element = sizeof(T) / sizeof(atom_t); - - const dim3 block; - const int stride; - const unsigned int offset = 0; // dynamic offset in bytes - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_dynamic { - atom_t *operator()(unsigned) - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, dynamic specialization - @return Shared memory pointer - */ - template struct cache_dynamic { - __device__ inline atom_t *operator()(unsigned int offset) - { - extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); - } - }; - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_static { - atom_t *operator()() - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, static specialization - @return Shared memory pointer - */ - template struct cache_static { - __device__ inline atom_t *operator()() - { - static __shared__ atom_t cache_[n_element * block_size_x * block_size_y * block_size_z]; - return reinterpret_cast(cache_); - } - }; - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(offset); - } - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(); - } - - __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const - { - atom_t tmp[n_element]; - memcpy(tmp, (void *)&a, sizeof(T)); - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; - } - - __device__ __host__ inline T load_detail(int x, int y, int z) const - { - atom_t tmp[n_element]; - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; - T a; - memcpy((void *)&a, tmp, sizeof(T)); - return a; - } - - /** - @brief Dummy instantiation for the host compiler - */ - template struct sync_impl { - void operator()() { } - }; - - /** - @brief Synchronize the cache when on the device - */ - template struct sync_impl { - __device__ inline void operator()() { __syncthreads(); } - }; - - public: - /** - @brief constructor for SharedMemory cache. If no arguments are - pass, then the dimensions are set according to the templates - block_size_y and block_size_z, together with the derived - block_size_x. Otherwise use the block sizes passed into the - constructor. - - @param[in] block Block dimensions for the 3-d shared memory object - @param[in] thread_offset "Perceived" offset from dynamic shared - memory base pointer (used when we have multiple caches in - scope). Need to include block size to actual offset. 
- */ - constexpr SharedMemoryCache(dim3 block = dim3(block_size_x, block_size_y, block_size_z), - unsigned int thread_offset = 0) : - block(block), stride(block.x * block.y * block.z), offset(stride * thread_offset) - { - } - - /** - @brief Grab the raw base address to shared memory. - */ - __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - */ - __device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - save_detail(a, x, y, z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - */ - __device__ __host__ inline void save_x(const T &a, int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - save_detail(a, x, tid.y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] y The y index to use - */ - __device__ __host__ inline void save_y(const T &a, int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - save_detail(a, tid.x, y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] z The z index to use - */ - __device__ __host__ inline void save_z(const T &a, int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? tid.z : z; - save_detail(a, tid.x, tid.y, z); - } - - /** - @brief Load a value from the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - return load_detail(x, y, z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] x The x index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_x(int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - return load_detail(x, tid.y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] y The y index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_y(int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - return load_detail(tid.x, y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_z(int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? 
tid.z : z; - return load_detail(tid.x, tid.y, z); - } - - /** - @brief Synchronize the cache - */ - __device__ __host__ void sync() const { target::dispatch(); } - - /** - @brief Cast operator to allow cache objects to be used where T - is expected - */ - __device__ __host__ operator T() const { return load(); } - - /** - @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected. - */ - __device__ __host__ void operator=(const T &src) const { save(src); } - }; - -} // namespace quda - -// include overloads #include "../generic/shared_memory_cache_helper.h" diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index bd8d919359..21a61cca45 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -14,10 +14,10 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the static member S::size(). The - offset from the beginning of the total shared memory block is - given by the static member O::shared_mem_size(block), or 0 if O - is void. + number of elements given by the call to the static member + S::size(target::block_dim()). The offset from the beginning of + the total shared memory block is given by the static member + O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory { @@ -56,6 +56,7 @@ namespace quda return target::dispatch(offset); } + public: static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -63,16 +64,16 @@ namespace quda return o; } - public: static constexpr unsigned int shared_mem_size(dim3 block) { - return get_offset(block) + S::size()*sizeof(T); + return get_offset(block) + S::size(block)*sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), + size(S::size(target::block_dim())) {} /** @brief Subscripting operator returning a reference to element. diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index 4fe1bb33f6..cf778afbd5 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -1,49 +1,15 @@ #pragma once -#include "shared_memory_cache_helper.h" +//#ifndef _NVHPC_CUDA -namespace quda -{ - -#ifndef _NVHPC_CUDA - - /** - @brief Class that provides indexable per-thread storage. On CUDA - this maps to using assigning each thread a unique window of - shared memory using the SharedMemoryCache object. - */ - template struct thread_array { - SharedMemoryCache, 1, 1, false> device_array; - int offset; - array host_array; - array &array_; - - __device__ __host__ constexpr thread_array() : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array(); // call default constructor - } +//#include "../generic/thread_array.h" - template - __device__ __host__ constexpr thread_array(T first, const Ts... other) : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? 
*(device_array.data() + offset) : host_array)
-    {
-      array_ = array {first, other...};
-    }
-
-    __device__ __host__ T &operator[](int i) { return array_[i]; }
-    __device__ __host__ const T &operator[](int i) const { return array_[i]; }
-  };
-
-#else
-
-  template struct thread_array : array {
-  };
-
-#endif
+#include 
 namespace quda
 {
+  template struct thread_array : array {};
+}
 
-} // namespace quda
+//#endif
diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h
new file mode 100644
index 0000000000..fe90c0e5ed
--- /dev/null
+++ b/include/targets/generic/helpers.h
@@ -0,0 +1,42 @@
+#pragma once
+
+namespace quda
+{
+
+  template
+  using atom_t = std::conditional_t>;
+
+  template struct SizeStatic {
+    static constexpr unsigned int size(dim3 block) {
+      return N;
+    }
+  };
+
+  template struct SizePerThread {
+    static constexpr unsigned int size(dim3 block) {
+      return N * block.x * block.y * block.z;
+    }
+  };
+
+  template struct SizeDims {
+    static constexpr unsigned int size(dim3 block) {
+      dim3 dims = D::dims(block);
+      return dims.x * dims.y * dims.z * N;
+    }
+  };
+
+  struct DimsBlock {
+    static constexpr dim3 dims(dim3 block) {
+      return block;
+    }
+  };
+
+  /**
+     @brief Uniform helper for exposing type T, whether we are dealing
+     with an instance of T or some wrapper of T
+   */
+  template struct get_type {
+    using type = T;
+  };
+
+}
diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h
index 060e3478f8..ddf831e6a4 100644
--- a/include/targets/generic/shared_memory_cache_helper.h
+++ b/include/targets/generic/shared_memory_cache_helper.h
@@ -1,3 +1,16 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+/**
+   @file shared_memory_cache_helper.h
+
+   Helper functionality for aiding the use of the shared memory for
+   sharing data between threads in a thread block.
+ */
+
 /**
    @file shared_memory_cache_helper.h
    @brief Convenience overloads to allow SharedMemoryCache objects to
@@ -9,46 +22,269 @@
 namespace quda
 {
 
-  template
-  __device__ __host__ inline T operator+(const SharedMemoryCache &a, const T &b)
+  /**
+     @brief Class which wraps around a shared memory cache for type T,
+     where each thread in the thread block stores a unique value in
+     the cache which any other thread can access.
+
+     The cache dimensions are derived from the kernel block dimensions
+     via the dims_type policy D, and the offset_type O names any other
+     shared-memory consumer allocated before this one, so that the two
+     allocations do not overlap.
+   */
+  template
+  class SharedMemoryCache : SharedMemory, SizeDims)>, O>
+  {
+  public:
+    using value_type = T;
+    using dims_type = D;
+    using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
+    using Smem = SharedMemory, SizeDims)>, O>;
+
+  private:
+    using atom_t = atom_t;
+    static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");
+
+    // The number of elements of type atom_t that we break T into for optimal shared-memory access
+    static constexpr int n_element = sizeof(T) / sizeof(atom_t);
+
+    const dim3 block;
+    const int stride;
+
+    constexpr Smem smem() const { return *dynamic_cast(this); }
+
+    __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
+    {
+      atom_t tmp[n_element];
+      memcpy(tmp, (void *)&a, sizeof(T));
+      int j = (z * block.y + y) * block.x + x;
+#pragma unroll
+      for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i];
+    }
+
+    __device__ __host__ inline T load_detail(int x, int y, int z) const
+    {
+      atom_t tmp[n_element];
+      int j = (z * block.y + y) * block.x + x;
+#pragma unroll
+      for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j];
+      T a;
+      memcpy((void *)&a, tmp, sizeof(T));
+      return a;
+    }
+
+    /**
+       @brief Dummy instantiation for the host compiler
+    */
+    template struct sync_impl {
+      void operator()() { }
+    };
+
+    /**
+       @brief Synchronize the cache when on the device
+    */
+    template struct sync_impl {
+      __device__ inline void operator()() { __syncthreads(); }
+    };
+
+  public:
+    using Smem::shared_mem_size;
+
+    /**
+       @brief constructor for SharedMemory cache. The cache dimensions
+       are derived from the kernel block dimensions via the dims_type
+       policy D, and the allocation is offset past any preceding
+       offset_type allocation.
+    */
+    constexpr SharedMemoryCache() :
+      block(D::dims(target::block_dim())), stride(block.x * block.y * block.z)
+    {
+      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims::size(dim3{8,8,8})*sizeof(T));
+    }
+
+    /**
+       @brief Grab the raw base address to shared memory.
+    */
+    __device__ __host__ inline auto data() const {
+      return reinterpret_cast(&smem()[0]);
+    }
+
+    /**
+       @brief Save the value into the 3-d shared memory cache.
+       @param[in] a The value to store in the shared memory cache
+       @param[in] x The x index to use
+       @param[in] y The y index to use
+       @param[in] z The z index to use
+    */
+    __device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const
+    {
+      auto tid = target::thread_idx();
+      x = (x == -1) ? tid.x : x;
+      y = (y == -1) ? tid.y : y;
+      z = (z == -1) ? tid.z : z;
+      save_detail(a, x, y, z);
+    }
+
+    /**
+       @brief Save the value into the 3-d shared memory cache.
+ @param[in] a The value to store in the shared memory cache + @param[in] y The y index to use + */ + __device__ __host__ inline void save_y(const T &a, int y = -1) const + { + auto tid = target::thread_idx(); + y = (y == -1) ? tid.y : y; + save_detail(a, tid.x, y, tid.z); + } + + /** + @brief Save the value into the 3-d shared memory cache. + @param[in] a The value to store in the shared memory cache + @param[in] z The z index to use + */ + __device__ __host__ inline void save_z(const T &a, int z = -1) const + { + auto tid = target::thread_idx(); + z = (z == -1) ? tid.z : z; + save_detail(a, tid.x, tid.y, z); + } + + /** + @brief Load a value from the shared memory cache + @param[in] x The x index to use + @param[in] y The y index to use + @param[in] z The z index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const + { + auto tid = target::thread_idx(); + x = (x == -1) ? tid.x : x; + y = (y == -1) ? tid.y : y; + z = (z == -1) ? tid.z : z; + return load_detail(x, y, z); + } + + /** + @brief Load a vector from the shared memory cache + @param[in] x The x index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load_x(int x = -1) const + { + auto tid = target::thread_idx(); + x = (x == -1) ? tid.x : x; + return load_detail(x, tid.y, tid.z); + } + + /** + @brief Load a vector from the shared memory cache + @param[in] y The y index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load_y(int y = -1) const + { + auto tid = target::thread_idx(); + y = (y == -1) ? tid.y : y; + return load_detail(tid.x, y, tid.z); + } + + /** + @brief Load a vector from the shared memory cache + @param[in] z The z index to use + @return The value at coordinates (x,y,z) + */ + __device__ __host__ inline T load_z(int z = -1) const + { + auto tid = target::thread_idx(); + z = (z == -1) ? tid.z : z; + return load_detail(tid.x, tid.y, z); + } + + /** + @brief Synchronize the cache + */ + __device__ __host__ void sync() const { target::dispatch(); } + + /** + @brief Cast operator to allow cache objects to be used where T + is expected + */ + __device__ __host__ operator T() const { return load(); } + + /** + @brief Assignment operator to allow cache objects to be used on + the lhs where T is otherwise expected. 
+   */
+    __device__ __host__ void operator=(const T &src) const { save(src); }
+  };
+
+  template
+  __device__ __host__ inline T operator+(const SharedMemoryCache &a, const T &b)
   {
     return static_cast(a) + b;
   }
 
-  template
-  __device__ __host__ inline T operator+(const T &a, const SharedMemoryCache &b)
+  template
+  __device__ __host__ inline T operator+(const T &a, const SharedMemoryCache &b)
   {
     return a + static_cast(b);
   }
 
-  template
-  __device__ __host__ inline T operator-(const SharedMemoryCache &a, const T &b)
+  template
+  __device__ __host__ inline T operator-(const SharedMemoryCache &a, const T &b)
   {
     return static_cast(a) - b;
   }
 
-  template
-  __device__ __host__ inline T operator-(const T &a, const SharedMemoryCache &b)
+  template
+  __device__ __host__ inline T operator-(const T &a, const SharedMemoryCache &b)
   {
     return a - static_cast(b);
   }
 
-  template
-  __device__ __host__ inline auto operator+=(SharedMemoryCache &a, const T &b)
+  template
+  __device__ __host__ inline auto operator+=(SharedMemoryCache &a, const T &b)
   {
     a.save(static_cast(a) + b);
     return a;
   }
 
-  template
-  __device__ __host__ inline auto operator-=(SharedMemoryCache &a, const T &b)
+  template
+  __device__ __host__ inline auto operator-=(SharedMemoryCache &a, const T &b)
   {
     a.save(static_cast(a) - b);
     return a;
   }
 
-  template
-  __device__ __host__ inline auto conj(const SharedMemoryCache &a)
+  template
+  __device__ __host__ inline auto conj(const SharedMemoryCache &a)
   {
     return conj(static_cast(a));
   }
 
@@ -57,12 +293,9 @@ namespace quda
      @brief Uniform helper for exposing type T, whether we are dealing
      with an instance of T or SharedMemoryCache
    */
-  template struct get_type {
-    using type = T;
-  };
   template struct get_type<
-    T, std::enable_if_t>>> {
+    T, std::enable_if_t>>> {
     using type = typename T::value_type;
   };
 
diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h
new file mode 100644
index 0000000000..3bbb739964
--- /dev/null
+++ b/include/targets/generic/thread_array.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace quda
+{
+
+  /**
+     @brief Class that provides indexable per-thread storage. On CUDA
+     this maps to assigning each thread a unique window of shared
+     memory via the underlying SharedMemory object.
+   */
+  template
+  class thread_array : SharedMemory, SizePerThread<1>, O>
+  {
+    int offset;
+    using Smem = SharedMemory, SizePerThread<1>, O>;
+    constexpr Smem smem() const { return *dynamic_cast(this); }
+    array &data() const { return smem()[offset]; }
+
+    public:
+    __device__ __host__ constexpr thread_array()
+    {
+      offset = target::thread_idx_linear<3>();
+      data() = array(); // call default constructor
+    }
+
+    template
+    __device__ __host__ constexpr thread_array(T first, const Ts...
other) + { + offset = target::thread_idx_linear<3>(); + data() = array {first, other...}; + } + + __device__ __host__ T &operator[](int i) { return data()[i]; } + __device__ __host__ const T &operator[](int i) const { return data()[i]; } + }; + +} // namespace quda diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 8df61aebc4..3b14564bec 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include /** @file thread_local_cache.h @@ -14,25 +14,19 @@ namespace quda { - template - using atom_t = std::conditional_t>; - - template struct SizeStatic { - static constexpr unsigned int size() { return N; } - }; - /** @brief Class for threads to store a unique value, or array of values, which can use shared memory for optimization purposes. */ - template class ThreadLocalCache : SharedMemory, SizeStatic, O> + template class ThreadLocalCache : + SharedMemory, SizePerThread)>, O> { public: using value_type = T; - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int N = N_; // size of array, 0 means to behave like T instead of array + using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int len = std::max(1,N); // actual number of elements to store - using Smem = SharedMemory, SizeStatic, O>; + using Smem = SharedMemory, SizePerThread)>, O>; private: using atom_t = atom_t; @@ -65,23 +59,15 @@ namespace quda return a; } - static constexpr unsigned int get_offset(dim3 block) - { - unsigned int o = 0; - if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } - return o; - } - public: - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + len * block.x * block.y * block.z * sizeof(T); - } + using Smem::shared_mem_size; /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : stride(target::block_size<3>()) {} + constexpr ThreadLocalCache() : stride(target::block_size<3>()) { + static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizePerThread::size(dim3{8,8,8})*sizeof(T)); + } /** @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. 
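
Taken together, the refactor in this patch leaves every shared-memory consumer
(SharedMemoryCache, ThreadLocalCache, thread_array) computing its footprint
from a size policy and chaining its starting offset through the offset_type
parameter O. The standalone sketch below models only that offset arithmetic on
the host: dim3, Link, SizePerThread and the *Model/*Cache names here are
simplified stand-ins rather than the QUDA definitions, but the
get_offset/shared_mem_size recursion mirrors the one in shared_memory_helper.h
and is what lets multiple caches, like the ThreadLocalCache uses in
hisq_paths_force.cuh above, coexist in one dynamic shared-memory block.

    #include <cstdio>
    #include <type_traits>

    struct dim3 { unsigned int x, y, z; }; // host stand-in for the CUDA builtin

    // one element of the wrapped type per thread, as in targets/generic/helpers.h
    template <int N = 1> struct SizePerThread {
      static constexpr unsigned int size(dim3 b) { return N * b.x * b.y * b.z; }
    };

    // model of SharedMemory<T, S, O>: the offset is the previous consumer's
    // total footprint, so successive consumers never overlap
    template <typename T, typename S, typename O = void> struct SharedMemoryModel {
      static constexpr unsigned int get_offset(dim3 b)
      {
        if constexpr (!std::is_same_v<O, void>) return O::shared_mem_size(b);
        return 0;
      }
      static constexpr unsigned int shared_mem_size(dim3 b)
      {
        return get_offset(b) + S::size(b) * sizeof(T);
      }
    };

    struct Link { double v[18]; }; // stand-in for a 3x3 complex matrix

    using FirstCache = SharedMemoryModel<Link, SizePerThread<1>>;              // starts at byte 0
    using SecondCache = SharedMemoryModel<Link, SizePerThread<1>, FirstCache>; // starts where FirstCache ends

    int main()
    {
      dim3 block {128, 1, 1};
      std::printf("first ends at byte %u\n", FirstCache::shared_mem_size(block));   // 18432
      std::printf("second ends at byte %u\n", SecondCache::shared_mem_size(block)); // 36864
    }

With a 128-thread block each Link-per-thread window is 128 * 144 = 18432
bytes, so the second cache begins exactly where the first ends and the two
allocations never alias.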
From f2419e14a1b0185690582cac62f5154971d0e469 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 8 Aug 2023 23:30:34 -0500 Subject: [PATCH 09/27] remove unused parameter --- include/targets/cuda/thread_array.h | 42 ++++++++++++++++++++++++++--- include/targets/generic/helpers.h | 2 +- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index cf778afbd5..c0c7b36929 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -1,10 +1,46 @@ #pragma once -//#ifndef _NVHPC_CUDA +#ifndef _NVHPC_CUDA //#include "../generic/thread_array.h" +#include +//#include +#include +#include + +namespace quda +{ + template + struct thread_array : SharedMemory, SizePerThread<1>, O> { + int offset; + array host_array; + array &array_; + using Smem = SharedMemory, SizePerThread<1>, O>; + constexpr Smem smem() const { return *dynamic_cast(this); } + + __device__ __host__ constexpr thread_array() : + offset(target::thread_idx_linear<3>()), + array_(*(&smem()[0] + offset)) + //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) + { + array_ = array(); // call default constructor + } + + template + __device__ __host__ constexpr thread_array(T first, const Ts... other) : + offset(target::thread_idx_linear<3>()), + array_(*(&smem()[0] + offset)) + //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) + { + array_ = array {first, other...}; + } + + __device__ __host__ T &operator[](int i) { return array_[i]; } + __device__ __host__ const T &operator[](int i) const { return array_[i]; } + }; +} -//#else +#else #include namespace quda @@ -12,4 +48,4 @@ namespace quda template struct thread_array : array {}; } -//#endif +#endif diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index fe90c0e5ed..4c35347e60 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -7,7 +7,7 @@ namespace quda using atom_t = std::conditional_t>; template struct SizeStatic { - static constexpr unsigned int size(dim3 block) { + static constexpr unsigned int size(dim3) { return N; } }; From dd7b975844874c34bf0a41988ceb3d401e4856c9 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 11 Aug 2023 21:08:16 -0500 Subject: [PATCH 10/27] update thread_array --- include/targets/cuda/thread_array.h | 38 +------------------------- include/targets/generic/thread_array.h | 19 ++++++------- 2 files changed, 10 insertions(+), 47 deletions(-) diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index c0c7b36929..f5e965576d 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -2,43 +2,7 @@ #ifndef _NVHPC_CUDA -//#include "../generic/thread_array.h" -#include -//#include -#include -#include - -namespace quda -{ - template - struct thread_array : SharedMemory, SizePerThread<1>, O> { - int offset; - array host_array; - array &array_; - using Smem = SharedMemory, SizePerThread<1>, O>; - constexpr Smem smem() const { return *dynamic_cast(this); } - - __device__ __host__ constexpr thread_array() : - offset(target::thread_idx_linear<3>()), - array_(*(&smem()[0] + offset)) - //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) - { - array_ = array(); // call default constructor - } - - template - __device__ __host__ constexpr thread_array(T first, const Ts... 
other) : - offset(target::thread_idx_linear<3>()), - array_(*(&smem()[0] + offset)) - //array_(target::is_device() ? *(&smem()[0] + offset) : host_array) - { - array_ = array {first, other...}; - } - - __device__ __host__ T &operator[](int i) { return array_[i]; } - __device__ __host__ const T &operator[](int i) const { return array_[i]; } - }; -} +#include "../generic/thread_array.h" #else diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 3bbb739964..5bbb243391 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -15,27 +15,26 @@ namespace quda template class thread_array : SharedMemory, SizePerThread<1>, O> { - int offset; using Smem = SharedMemory, SizePerThread<1>, O>; constexpr Smem smem() const { return *dynamic_cast(this); } - array &data() const { return smem()[offset]; } + array &array_; public: - __device__ __host__ constexpr thread_array() + __device__ __host__ constexpr thread_array() : + array_(smem()[target::thread_idx_linear<3>()]) { - offset = target::thread_idx_linear<3>(); - data() = array(); // call default constructor + array_ = array(); // call default constructor } template - __device__ __host__ constexpr thread_array(T first, const Ts... other) + __device__ __host__ constexpr thread_array(T first, const Ts... other) : + array_(smem()[target::thread_idx_linear<3>()]) { - offset = target::thread_idx_linear<3>(); - data() = array {first, other...}; + array_ = array {first, other...}; } - __device__ __host__ T &operator[](int i) { return data()[i]; } - __device__ __host__ const T &operator[](int i) const { return data()[i]; } + __device__ __host__ T &operator[](int i) { return array_[i]; } + __device__ __host__ const T &operator[](int i) const { return array_[i]; } }; } // namespace quda From 72cc52b9df373b08e008c769757f8cfccc3d8f83 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 16 Aug 2023 10:10:47 -0500 Subject: [PATCH 11/27] update remaining shared memory uses --- include/kernels/block_transpose.cuh | 11 ++++++- include/kernels/coarse_op_kernel.cuh | 18 +++++++++--- include/kernels/color_spinor_pack.cuh | 29 +++++++++++-------- include/targets/generic/helpers.h | 7 +++++ .../generic/shared_memory_cache_helper.h | 21 ++++++++++---- 5 files changed, 63 insertions(+), 23 deletions(-) diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh index 153c2c9695..2b41f5c520 100644 --- a/include/kernels/block_transpose.cuh +++ b/include/kernels/block_transpose.cuh @@ -47,6 +47,14 @@ namespace quda constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { } static constexpr const char *filename() { return KERNEL_FILE; } + struct Dims { + static constexpr dim3 dims(dim3 block) { + block.x += 1; + block.z = 1; + return block; + } + }; + /** @brief Transpose between the two different orders of batched colorspinor fields: - B: nVec -> spatial/N -> spin/color -> N, where N is for that in floatN @@ -60,7 +68,8 @@ namespace quda int parity = parity_color / Arg::nColor; using color_spinor_t = ColorSpinor; - SharedMemoryCache cache({target::block_dim().x + 1, target::block_dim().y, 1}); + //SharedMemoryCache cache({target::block_dim().x + 1, target::block_dim().y, 1}); + SharedMemoryCache cache; int x_offset = target::block_dim().x * target::block_idx().x; int v_offset = target::block_dim().y * target::block_idx().y; diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh index b63bf7f435..be411eb68c 100644 --- 
a/include/kernels/coarse_op_kernel.cuh +++ b/include/kernels/coarse_op_kernel.cuh @@ -10,6 +10,7 @@ #include #include #include +#include namespace quda { @@ -1387,14 +1388,21 @@ namespace quda { }; template <> struct storeCoarseSharedAtomic_impl { + template using CacheT = + complex[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + template using Cache = SharedMemoryCache,DimsStatic<2,1,1>>; + template inline __device__ void operator()(VUV &vuv, bool isDiagonal, int coarse_x_cb, int coarse_parity, int i0, int j0, int parity, const Pack &pack, const Arg &arg) { using real = typename Arg::Float; using TileType = typename Arg::vuvTileType; const int dim_index = arg.dim_index % arg.Y_atomic.geometry; - __shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; - __shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + //__shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + //__shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + Cache cache; + auto &X = cache.data()[0]; + auto &Y = cache.data()[1]; int x_ = coarse_x_cb % arg.aggregates_per_block; int tx = virtualThreadIdx(arg); @@ -1416,7 +1424,8 @@ namespace quda { } } - __syncthreads(); + //__syncthreads(); + cache.sync(); #pragma unroll for (int i = 0; i < TileType::M; i++) { @@ -1445,7 +1454,8 @@ namespace quda { } } - __syncthreads(); + //__syncthreads(); + cache.sync(); if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) { diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index 0f4c2b4466..2cc489181a 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ -171,27 +171,32 @@ namespace quda { } }; - template - struct DimsPadX { - static constexpr dim3 dims(dim3 block) { - if (is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); - return block; - } - }; - template <> struct site_max { + template + struct DimsPadX { + static constexpr int Ms = spins_per_thread(Arg::nSpin); + static constexpr int Mc = colors_per_thread(Arg::nColor); + static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); + static constexpr dim3 dims(dim3 block) { + if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); + block.y = color_spin_threads; // state the y block since we know it at compile time + return block; + } + }; + template __device__ inline auto operator()(typename Arg::real thread_max, Arg &) { using real = typename Arg::real; - constexpr int Ms = spins_per_thread(Arg::nSpin); - constexpr int Mc = colors_per_thread(Arg::nColor); - constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); + //constexpr int Ms = spins_per_thread(Arg::nSpin); + //constexpr int Mc = colors_per_thread(Arg::nColor); + //constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); + constexpr int color_spin_threads = DimsPadX::color_spin_threads; //auto block = target::block_dim(); // pad the shared block size to avoid bank conflicts for native ordering //if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); //block.y = 
color_spin_threads; // state the y block since we know it at compile time //SharedMemoryCache cache(block); - SharedMemoryCache> cache; + SharedMemoryCache> cache; cache.save(thread_max); cache.sync(); real this_site_max = static_cast(0); diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index 4c35347e60..2ab4efbda4 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -31,6 +31,13 @@ namespace quda } }; + template + struct DimsStatic { + static constexpr dim3 dims(dim3 block) { + return dim3(x,y,z); + } + }; + /** @brief Uniform helper for exposing type T, whether we are dealing with an instance of T or some wrapper of T diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index ddf831e6a4..f9b7eccd77 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -56,6 +56,9 @@ namespace quda // The number of elements of type atom_t that we break T into for optimal shared-memory access static constexpr int n_element = sizeof(T) / sizeof(atom_t); + // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array) + template using maybeT = std::conditional_t,T,void>; + const dim3 block; const int stride; @@ -70,7 +73,8 @@ namespace quda for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i]; } - __device__ __host__ inline T load_detail(int x, int y, int z) const + template + __device__ __host__ inline maybeT load_detail(int x, int y, int z) const { atom_t tmp[n_element]; int j = (z * block.y + y) * block.x + x; @@ -182,7 +186,8 @@ namespace quda @param[in] z The z index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const + template + __device__ __host__ inline maybeT load(int x = -1, int y = -1, int z = -1) const { auto tid = target::thread_idx(); x = (x == -1) ? tid.x : x; @@ -196,7 +201,8 @@ namespace quda @param[in] x The x index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load_x(int x = -1) const + template + __device__ __host__ inline maybeT load_x(int x = -1) const { auto tid = target::thread_idx(); x = (x == -1) ? tid.x : x; @@ -208,7 +214,8 @@ namespace quda @param[in] y The y index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load_y(int y = -1) const + template + __device__ __host__ inline maybeT load_y(int y = -1) const { auto tid = target::thread_idx(); y = (y == -1) ? tid.y : y; @@ -220,7 +227,8 @@ namespace quda @param[in] z The z index to use @return The value at coordinates (x,y,z) */ - __device__ __host__ inline T load_z(int z = -1) const + template + __device__ __host__ inline maybeT load_z(int z = -1) const { auto tid = target::thread_idx(); z = (z == -1) ? 
tid.z : z; @@ -236,7 +244,8 @@ namespace quda @brief Cast operator to allow cache objects to be used where T is expected */ - __device__ __host__ operator T() const { return load(); } + template + __device__ __host__ operator maybeT() const { return load(); } /** @brief Assignment operator to allow cache objects to be used on From 13f6701af4c7d27108ecfa6224923f49ad94691b Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 16 Aug 2023 12:21:07 -0500 Subject: [PATCH 12/27] fix HIP target --- .../targets/hip/shared_memory_cache_helper.h | 294 ------------------ include/targets/hip/thread_array.h | 41 +-- 2 files changed, 1 insertion(+), 334 deletions(-) diff --git a/include/targets/hip/shared_memory_cache_helper.h b/include/targets/hip/shared_memory_cache_helper.h index 7c7c0a1b28..73be0cd01b 100644 --- a/include/targets/hip/shared_memory_cache_helper.h +++ b/include/targets/hip/shared_memory_cache_helper.h @@ -1,295 +1 @@ -#pragma once - -#include -#include - -/** - @file shared_memory_cache_helper.h - - Helper functionality for aiding the use of the shared memory for - sharing data between threads in a thread block. - */ - -namespace quda -{ - - /** - @brief Class which wraps around a shared memory cache for type T, - where each thread in the thread block stores a unique value in - the cache which any other thread can access. - - This accessor supports both explicit run-time block size and - compile-time sizing. - - * For run-time block size, the constructor should be initialied - with the desired block size. - - * For compile-time block size, no arguments should be passed to - the constructor, and then the second and third template - parameters correspond to the y and z dimensions of the block, - respectively. The x dimension of the block will be set - according the maximum number of threads possible, given these - dimensions. 
- */ - template class SharedMemoryCache - { - public: - using value_type = T; - static constexpr int block_size_y = block_size_y_; - static constexpr int block_size_z = block_size_z_; - static constexpr bool dynamic = dynamic_; - - private: - /** maximum number of threads in x given the y and z block sizes */ - static constexpr int block_size_x = device::max_block_size(); - - using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); - - // The number of elements of type atom_t that we break T into for optimal shared-memory access - static constexpr int n_element = sizeof(T) / sizeof(atom_t); - - const dim3 block; - const int stride; - const unsigned int offset = 0; // dynamic offset in bytes - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_dynamic { - atom_t *operator()(unsigned) - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, dynamic specialization - @return Shared memory pointer - */ - template struct cache_dynamic { - __device__ inline atom_t *operator()(unsigned int offset) - { - extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); - } - }; - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_static { - atom_t *operator()() - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, static specialization - @return Shared memory pointer - */ - template struct cache_static { - __device__ inline atom_t *operator()() - { - static __shared__ atom_t cache_[n_element * block_size_x * block_size_y * block_size_z]; - return reinterpret_cast(cache_); - } - }; - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(offset); - } - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(); - } - - __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const - { - atom_t tmp[n_element]; - memcpy(tmp, (void *)&a, sizeof(T)); - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; - } - - __device__ __host__ inline T load_detail(int x, int y, int z) const - { - atom_t tmp[n_element]; - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; - T a; - memcpy((void *)&a, tmp, sizeof(T)); - return a; - } - - /** - @brief Dummy instantiation for the host compiler - */ - template struct sync_impl { - void operator()() { } - }; - - /** - @brief Synchronize the cache when on the device - */ - template struct sync_impl { - __device__ inline void operator()() { __syncthreads(); } - }; - - public: - /** - @brief constructor for SharedMemory cache. If no arguments are - pass, then the dimensions are set according to the templates - block_size_y and block_size_z, together with the derived - block_size_x. Otherwise use the block sizes passed into the - constructor. - - @param[in] block Block dimensions for the 3-d shared memory object - @param[in] thread_offset "Perceived" offset from dynamic shared - memory base pointer (used when we have multiple caches in - scope). Need to include block size to actual offset. 
- */ - constexpr SharedMemoryCache(dim3 block = dim3(block_size_x, block_size_y, block_size_z), - unsigned int thread_offset = 0) : - block(block), stride(block.x * block.y * block.z), offset(stride * thread_offset) - { - } - - /** - @brief Grab the raw base address to shared memory. - */ - __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - */ - __device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - save_detail(a, x, y, z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - */ - __device__ __host__ inline void save_x(const T &a, int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - save_detail(a, x, tid.y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] y The y index to use - */ - __device__ __host__ inline void save_y(const T &a, int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - save_detail(a, tid.x, y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] z The z index to use - */ - __device__ __host__ inline void save_z(const T &a, int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? tid.z : z; - save_detail(a, tid.x, tid.y, z); - } - - /** - @brief Load a value from the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - return load_detail(x, y, z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] x The x index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_x(int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - return load_detail(x, tid.y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] y The y index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_y(int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - return load_detail(tid.x, y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_z(int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? 
tid.z : z; - return load_detail(tid.x, tid.y, z); - } - - /** - @brief Synchronize the cache - */ - __device__ __host__ void sync() const { target::dispatch(); } - - /** - @brief Cast operator to allow cache objects to be used where T - is expected - */ - __device__ __host__ operator T() const { return load(); } - - /** - @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected. - */ - __device__ __host__ void operator=(const T &src) const { save(src); } - }; - -} // namespace quda - -// include overloads #include "../generic/shared_memory_cache_helper.h" diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h index 77751adf1b..24f986cc26 100644 --- a/include/targets/hip/thread_array.h +++ b/include/targets/hip/thread_array.h @@ -1,40 +1 @@ -#pragma once - -#include "shared_memory_cache_helper.h" - -namespace quda -{ - - /** - @brief Class that provides indexable per-thread storage. On HIP - this maps to using assigning each thread a unique window of - shared memory using the SharedMemoryCache object. - */ - template struct thread_array { - SharedMemoryCache, 1, 1, false> device_array; - int offset; - array host_array; - array &array_; - - __device__ __host__ constexpr thread_array() : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array(); // call default constructor - } - - template - __device__ __host__ constexpr thread_array(T first, const Ts... other) : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array {first, other...}; - } - - __device__ __host__ T &operator[](int i) { return array_[i]; } - __device__ __host__ const T &operator[](int i) const { return array_[i]; } - }; - -} // namespace quda +#include "../generic/thread_array.h" From 7df0d93027934b73b1978b301b25877ab5dd18f5 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 16 Aug 2023 12:40:02 -0500 Subject: [PATCH 13/27] fix HIP build --- include/targets/hip/shared_memory_helper.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index bd8d919359..21a61cca45 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -14,10 +14,10 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the static member S::size(). The - offset from the beginning of the total shared memory block is - given by the static member O::shared_mem_size(block), or 0 if O - is void. + number of elements given by the call to the static member + S::size(target::block_dim()). The offset from the beginning of + the total shared memory block is given by the static member + O::shared_mem_size(target::block_dim()), or 0 if O is void. 
*/ template class SharedMemory { @@ -56,6 +56,7 @@ namespace quda return target::dispatch(offset); } + public: static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -63,16 +64,16 @@ namespace quda return o; } - public: static constexpr unsigned int shared_mem_size(dim3 block) { - return get_offset(block) + S::size()*sizeof(T); + return get_offset(block) + S::size(block)*sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size()) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), + size(S::size(target::block_dim())) {} /** @brief Subscripting operator returning a reference to element. From c73d2b4ae4b7a001aa7e782bf6ccaeb90c9a6f36 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 18 Aug 2023 22:41:07 -0500 Subject: [PATCH 14/27] update shared memory object --- include/targets/cuda/shared_memory_helper.h | 2 ++ include/targets/generic/shared_memory_cache_helper.h | 8 +++++--- include/targets/generic/thread_array.h | 4 +++- include/targets/generic/thread_local_cache.h | 6 +++--- include/targets/hip/shared_memory_helper.h | 2 ++ 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index 21a61cca45..b7dacbb9cf 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -75,6 +75,8 @@ namespace quda constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size(target::block_dim())) {} + constexpr auto smem() const { return *this; } + /** @brief Subscripting operator returning a reference to element. @param[in] i The index to use. diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index f9b7eccd77..149a7016f3 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -48,6 +48,7 @@ namespace quda using dims_type = D; using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one using Smem = SharedMemory, SizeDims)>, O>; + using Smem::shared_mem_size; private: using atom_t = atom_t; @@ -62,7 +63,10 @@ namespace quda const dim3 block; const int stride; - constexpr Smem smem() const { return *dynamic_cast(this); } + //constexpr Smem smem() const { return *dynamic_cast(this); } + using Smem::smem; + //constexpr Smem smem() const { return Smem::smem(); } + //constexpr Smem smem() const { return *Smem::smemp(); } __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const { @@ -100,8 +104,6 @@ namespace quda }; public: - using Smem::shared_mem_size; - /** @brief constructor for SharedMemory cache. 
If no arguments are pass, then the dimensions are set according to the templates diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 5bbb243391..583468c036 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -16,9 +16,11 @@ namespace quda class thread_array : SharedMemory, SizePerThread<1>, O> { using Smem = SharedMemory, SizePerThread<1>, O>; - constexpr Smem smem() const { return *dynamic_cast(this); } array &array_; + //constexpr Smem smem() const { return *dynamic_cast(this); } + using Smem::smem; + public: __device__ __host__ constexpr thread_array() : array_(smem()[target::thread_idx_linear<3>()]) diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 3b14564bec..545a365e25 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -27,6 +27,7 @@ namespace quda using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int len = std::max(1,N); // actual number of elements to store using Smem = SharedMemory, SizePerThread)>, O>; + using Smem::shared_mem_size; private: using atom_t = atom_t; @@ -37,7 +38,8 @@ namespace quda const int stride; - constexpr Smem smem() const { return *dynamic_cast(this); } + //constexpr Smem smem() const { return *dynamic_cast(this); } + using Smem::smem; __device__ __host__ inline void save_detail(const T &a, const int k) const { @@ -60,8 +62,6 @@ namespace quda } public: - using Smem::shared_mem_size; - /** @brief Constructor for ThreadLocalCache. */ diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index 21a61cca45..b7dacbb9cf 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -75,6 +75,8 @@ namespace quda constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), size(S::size(target::block_dim())) {} + constexpr auto smem() const { return *this; } + /** @brief Subscripting operator returning a reference to element. @param[in] i The index to use. 
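The replacement of the dynamic_cast accessor with a using-declaration deserves a remark: one plausible motivation is that dynamic_cast relies on RTTI, which is not available in device code, whereas re-exposing a privately inherited member costs nothing. A minimal sketch of the idiom (illustrative names, not from the patch):

    struct BaseSketch {
      constexpr int datum() const { return 42; }
      constexpr const BaseSketch &self() const { return *this; } // like smem()
    };

    struct DerivedSketch : private BaseSketch {
      using BaseSketch::self; // re-expose the private base accessor, no cast needed
      constexpr int get() const { return self().datum(); }
    };

    static_assert(DerivedSketch {}.get() == 42);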
From bab726ebc838f6c94bb514960665e0e984bd6072 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Sat, 19 Aug 2023 14:29:51 -0500 Subject: [PATCH 15/27] fix clang build --- include/targets/generic/helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index 2ab4efbda4..a08cefa84b 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -33,7 +33,7 @@ namespace quda template struct DimsStatic { - static constexpr dim3 dims(dim3 block) { + static constexpr dim3 dims(dim3) { return dim3(x,y,z); } }; From b57658280e970f79e28870b1a1bf1ad5e1e620e1 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 31 Aug 2023 22:02:38 -0500 Subject: [PATCH 16/27] code cleanup --- include/kernels/block_transpose.cuh | 5 +- include/kernels/coarse_op_kernel.cuh | 4 -- include/kernels/color_spinor_pack.cuh | 16 ++--- include/kernels/gauge_stout.cuh | 4 +- include/kernels/gauge_utils.cuh | 1 - include/kernels/gauge_wilson_flow.cuh | 4 +- include/kernels/hisq_paths_force.cuh | 1 - include/targets/cuda/shared_memory_helper.h | 21 ++++-- include/targets/cuda/thread_array.h | 1 + include/targets/generic/helpers.h | 18 +++++ .../generic/shared_memory_cache_helper.h | 69 ++++++------------- include/targets/generic/thread_array.h | 15 ++-- include/targets/generic/thread_local_cache.h | 39 +++++++---- include/targets/hip/shared_memory_helper.h | 21 ++++-- 14 files changed, 110 insertions(+), 109 deletions(-) diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh index 2b41f5c520..3c54345a6c 100644 --- a/include/kernels/block_transpose.cuh +++ b/include/kernels/block_transpose.cuh @@ -47,7 +47,7 @@ namespace quda constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { } static constexpr const char *filename() { return KERNEL_FILE; } - struct Dims { + struct CacheDims { static constexpr dim3 dims(dim3 block) { block.x += 1; block.z = 1; @@ -68,8 +68,7 @@ namespace quda int parity = parity_color / Arg::nColor; using color_spinor_t = ColorSpinor; - //SharedMemoryCache cache({target::block_dim().x + 1, target::block_dim().y, 1}); - SharedMemoryCache cache; + SharedMemoryCache cache; int x_offset = target::block_dim().x * target::block_idx().x; int v_offset = target::block_dim().y * target::block_idx().y; diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh index be411eb68c..04c05fda4e 100644 --- a/include/kernels/coarse_op_kernel.cuh +++ b/include/kernels/coarse_op_kernel.cuh @@ -1398,8 +1398,6 @@ namespace quda { using real = typename Arg::Float; using TileType = typename Arg::vuvTileType; const int dim_index = arg.dim_index % arg.Y_atomic.geometry; - //__shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; - //__shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; Cache cache; auto &X = cache.data()[0]; auto &Y = cache.data()[1]; @@ -1424,7 +1422,6 @@ namespace quda { } } - //__syncthreads(); cache.sync(); #pragma unroll @@ -1454,7 +1451,6 @@ namespace quda { } } - //__syncthreads(); cache.sync(); if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) { diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index 2cc489181a..a67d86ded2 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ 
-172,12 +172,12 @@ namespace quda { }; template <> struct site_max { - template - struct DimsPadX { + template struct CacheDims { static constexpr int Ms = spins_per_thread(Arg::nSpin); static constexpr int Mc = colors_per_thread(Arg::nColor); static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); static constexpr dim3 dims(dim3 block) { + // pad the shared block size to avoid bank conflicts for native ordering if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); block.y = color_spin_threads; // state the y block since we know it at compile time return block; @@ -187,16 +187,8 @@ namespace quda { template __device__ inline auto operator()(typename Arg::real thread_max, Arg &) { using real = typename Arg::real; - //constexpr int Ms = spins_per_thread(Arg::nSpin); - //constexpr int Mc = colors_per_thread(Arg::nColor); - //constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); - constexpr int color_spin_threads = DimsPadX::color_spin_threads; - //auto block = target::block_dim(); - // pad the shared block size to avoid bank conflicts for native ordering - //if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); - //block.y = color_spin_threads; // state the y block since we know it at compile time - //SharedMemoryCache cache(block); - SharedMemoryCache> cache; + constexpr int color_spin_threads = CacheDims::color_spin_threads; + SharedMemoryCache> cache; cache.save(thread_max); cache.sync(); real this_site_max = static_cast(0); diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 56bd00f425..4577e66fcd 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -135,8 +135,8 @@ namespace quda } Link U, Q; - ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Stap; + ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh index 6e91e1ac31..48c7e6c1cc 100644 --- a/include/kernels/gauge_utils.cuh +++ b/include/kernels/gauge_utils.cuh @@ -2,7 +2,6 @@ #include #include #include -#include namespace quda { diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index 050295f271..ae28956112 100644 --- a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -72,8 +72,8 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. 
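The CacheDims struct above moves the block-shape logic into a type so that it is also available to the static shared_mem_size computation. A stand-alone sketch of the padding idea, with the warp size fixed at 32 purely for illustration (the real code queries device::warp_size()):

    // Pad block.x up to a warp multiple so that row strides in the cache
    // avoid shared-memory bank conflicts for native field ordering.
    struct PadXDimsSketch {
      static constexpr unsigned int warp_size = 32; // assumed, not queried
      static constexpr dim3 dims(dim3 block)
      {
        block.x = ((block.x + warp_size - 1) / warp_size) * warp_size;
        return block;
      }
    };
    static_assert(PadXDimsSketch::dims(dim3 {40, 4, 1}).x == 64);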
From the following paper: // https://arxiv.org/abs/0801.1165 - ThreadLocalCache Stap{}; - ThreadLocalCache Rect{}; // offset by Stap type to ensure non-overlapping allocations + ThreadLocalCache Stap; + ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); break; diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index 84173a9a5a..35ddde688d 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -538,7 +538,6 @@ namespace quda { * The "extra" low point corresponds to the Lepage contribution to the * force_mu term. * - * * sig * F E * | | diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index b7dacbb9cf..2da596f6ed 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -14,9 +14,9 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the call to the static member - S::size(target::block_dim()). The offset from the beginning of - the total shared memory block is given by the static member + number of elements given by a call to the static member + S::size(target::block_dim()). The byte offset from the beginning + of the total shared memory block is given by the static member O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory @@ -26,7 +26,6 @@ namespace quda private: T *data; - const unsigned int size; // number of elements of type T /** @brief This is a dummy instantiation for the host compiler @@ -57,6 +56,9 @@ namespace quda } public: + /** + @brief Byte offset for this shared memory object. + */ static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -64,6 +66,9 @@ namespace quda return o; } + /** + @brief Shared memory size in bytes. + */ static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block)*sizeof(T); @@ -72,10 +77,12 @@ namespace quda /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), - size(S::size(target::block_dim())) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} - constexpr auto smem() const { return *this; } + /** + @brief Return this SharedMemory object. + */ + constexpr auto sharedMem() const { return *this; } /** @brief Subscripting operator returning a reference to element. diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index f5e965576d..c88a178e2c 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -7,6 +7,7 @@ #else #include + namespace quda { template struct thread_array : array {}; diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index a08cefa84b..f8faf41b44 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -3,21 +3,33 @@ namespace quda { + /** + @brief Element type used for coalesced storage. + */ template using atom_t = std::conditional_t>; + /** + @brief Used to declare an object of fixed size. 
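The atom_t alias added at the top of helpers.h selects the storage word used for coalesced accesses; a sketch of one plausible selection rule, assuming CUDA's int2/int4 vector types (the exact in-tree definition may differ):

    #include <type_traits>

    // Pick the widest 4/8/16-byte word that evenly divides sizeof(T), so a
    // stored element moves in as few shared-memory transactions as possible.
    template <typename T>
    using atom_t_sketch =
      std::conditional_t<sizeof(T) % 16 == 0, int4,
                         std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;

    static_assert(std::is_same_v<atom_t_sketch<float>, int>);
    static_assert(std::is_same_v<atom_t_sketch<double>, int2>);
    static_assert(std::is_same_v<atom_t_sketch<double[2]>, int4>);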
+ */ template struct SizeStatic { static constexpr unsigned int size(dim3) { return N; } }; + /** + @brief Used to declare an object of fixed size per thread, N. + */ template struct SizePerThread { static constexpr unsigned int size(dim3 block) { return N * block.x * block.y * block.z; } }; + /** + @brief Used to declare an object of fixed size per thread, N, with thread dimensions determined by D. + */ template struct SizeDims { static constexpr unsigned int size(dim3 block) { dim3 dims = D::dims(block); @@ -25,12 +37,18 @@ namespace quda } }; + /** + @brief Used to declare an object with dimensions given by the block size. + */ struct DimsBlock { static constexpr dim3 dims(dim3 block) { return block; } }; + /** + @brief Used to declare an object with fixed dimensions. + */ template struct DimsStatic { static constexpr dim3 dims(dim3) { diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index 149a7016f3..1866375f76 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -11,46 +11,38 @@ sharing data between threads in a thread block. */ -/** - @file shared_memory_cache_helper.h - @brief Convenience overloads to allow SharedMemoryCache objects to - appear in simple expressions. The actual implementation of - SharedMemoryCache is target specific, and located in e.g., - include/targets/cuda/shared_memory_cache_helper.h, etc. - */ namespace quda { /** @brief Class which wraps around a shared memory cache for type T, where each thread in the thread block stores a unique value in - the cache which any other thread can access. + the cache which any other thread can access. The data is stored + in a coalesced order with element size atom_t. - This accessor supports both explicit run-time block size and - compile-time sizing. + The dimensions of the cache are determined by a call to + D::dims(target::block_dim()), and D defaults to having dimensions + equal to the block dimensions. - * For run-time block size, the constructor should be initialied - with the desired block size. - - * For compile-time block size, no arguments should be passed to - the constructor, and then the second and third template - parameters correspond to the y and z dimensions of the block, - respectively. The x dimension of the block will be set - according the maximum number of threads possible, given these - dimensions. + A byte offset into the shared memory region can be specified with + the type O, and is given by + O::shared_mem_size(target::block_dim()) if O is not void. */ template class SharedMemoryCache : SharedMemory, SizeDims)>, O> { + using Smem = SharedMemory, SizeDims)>, O>; + public: using value_type = T; using dims_type = D; - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one - using Smem = SharedMemory, SizeDims)>, O>; + using offset_type = O; using Smem::shared_mem_size; private: + const dim3 block; + const int stride; + using Smem::sharedMem; using atom_t = atom_t; static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g.
C array) template using maybeT = std::conditional_t,T,void>; - const dim3 block; - const int stride; - - //constexpr Smem smem() const { return *dynamic_cast(this); } - using Smem::smem; - //constexpr Smem smem() const { return Smem::smem(); } - //constexpr Smem smem() const { return *Smem::smemp(); } - __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const { atom_t tmp[n_element]; memcpy(tmp, (void *)&a, sizeof(T)); int j = (z * block.y + y) * block.x + x; #pragma unroll - for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i]; } template @@ -83,7 +67,7 @@ namespace quda atom_t tmp[n_element]; int j = (z * block.y + y) * block.x + x; #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[i * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -105,28 +89,20 @@ namespace quda public: /** - @brief constructor for SharedMemory cache. If no arguments are - pass, then the dimensions are set according to the templates - block_size_y and block_size_z, together with the derived - block_size_x. Otherwise use the block sizes passed into the - constructor. - - @param[in] block Block dimensions for the 3-d shared memory object - @param[in] thread_offset "Perceived" offset from dynamic shared - memory base pointer (used when we have multiple caches in - scope). Need to include block size to actual offset. + @brief Constructor for SharedMemoryCache. */ constexpr SharedMemoryCache() : block(D::dims(target::block_dim())), stride(block.x * block.y * block.z) { - static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims::size(dim3{8,8,8})*sizeof(T)); + // sanity check + static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims::size(dim3{32,16,8})*sizeof(T)); } /** @brief Grab the raw base address to shared memory. */ __device__ __host__ inline auto data() const { - return reinterpret_cast(&smem()[0]); + return reinterpret_cast(&sharedMem()[0]); } /** @@ -302,11 +278,10 @@ namespace quda /** @brief Uniform helper for exposing type T, whether we are dealing - with an instance of T or SharedMemoryCache + with an instance of T or SharedMemoryCache */ template - struct get_type< - T, std::enable_if_t>>> { + struct get_type>>> { using type = typename T::value_type; }; diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 583468c036..5325e5ab8f 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -8,29 +8,28 @@ namespace quda { /** - @brief Class that provides indexable per-thread storage. On CUDA - this maps to using assigning each thread a unique window of - shared memory using the SharedMemoryCache object. + @brief Class that provides indexable per-thread storage for n + elements of type T. This version uses shared memory for storage. + The offset into the shared memory region is determined from the + type O. 
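For reference, the word-wise access pattern used by save_detail above can be written as a self-contained device-side sketch (atom_t here is any word type that evenly divides sizeof(T)):

    #include <cstring>

    // The object is split into n_element words, and word i for thread j goes
    // to smem[i * stride + j], so adjacent threads write adjacent words.
    template <typename T, typename atom_t>
    __device__ inline void save_strided_sketch(atom_t *smem, const T &a, int j, int stride)
    {
      constexpr int n_element = sizeof(T) / sizeof(atom_t);
      atom_t tmp[n_element];
      memcpy(tmp, &a, sizeof(T)); // decompose T into words without aliasing UB
    #pragma unroll
      for (int i = 0; i < n_element; i++) smem[i * stride + j] = tmp[i];
    }

The load path is the mirror image: gather the strided words into a local array, then memcpy them back into a T.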
*/ template class thread_array : SharedMemory, SizePerThread<1>, O> { using Smem = SharedMemory, SizePerThread<1>, O>; + using Smem::sharedMem; array &array_; - //constexpr Smem smem() const { return *dynamic_cast(this); } - using Smem::smem; - public: __device__ __host__ constexpr thread_array() : - array_(smem()[target::thread_idx_linear<3>()]) + array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array(); // call default constructor } template __device__ __host__ constexpr thread_array(T first, const Ts... other) : - array_(smem()[target::thread_idx_linear<3>()]) + array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array {first, other...}; } diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 545a365e25..277b16903c 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -15,39 +15,38 @@ namespace quda { /** - @brief Class for threads to store a unique value, or array of values, which can use - shared memory for optimization purposes. + @brief Class for threads to store a unique value (for N_ == 0), + or array of values (for N_ > 0), which can use shared memory for + optimization purposes. */ template class ThreadLocalCache : SharedMemory, SizePerThread)>, O> { + using Smem = SharedMemory, SizePerThread)>, O>; + public: using value_type = T; static constexpr int N = N_; // size of array, 0 means to behave like T instead of array using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one static constexpr int len = std::max(1,N); // actual number of elements to store - using Smem = SharedMemory, SizePerThread)>, O>; using Smem::shared_mem_size; private: + const int stride; + using Smem::sharedMem; using atom_t = atom_t; static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types"); // The number of elements of type atom_t that we break T into for optimal shared-memory access static constexpr int n_element = sizeof(T) / sizeof(atom_t); - const int stride; - - //constexpr Smem smem() const { return *dynamic_cast(this); } - using Smem::smem; - __device__ __host__ inline void save_detail(const T &a, const int k) const { atom_t tmp[n_element]; memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) smem()[(k*n_element + i) * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) sharedMem()[(k*n_element + i) * stride + j] = tmp[i]; } __device__ __host__ inline T load_detail(const int k) const @@ -55,7 +54,7 @@ namespace quda atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = smem()[(k*n_element + i) * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k*n_element + i) * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -66,7 +65,8 @@ namespace quda @brief Constructor for ThreadLocalCache. 
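As a usage illustration (the kernel context and the dir variable are placeholders, not from the patch), a thread_array reads like an ordinary small array even though each thread's copy lives in its own shared-memory window:

    __device__ void neighbor_shift_sketch(int dir)
    {
      thread_array<int, 4> dx {0, 0, 0, 0}; // per-thread storage, shared-memory backed
      dx[dir] = 1;                          // e.g., a unit shift in one direction
      // dx[] can now be used like any local coordinate-offset array
    }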
*/ constexpr ThreadLocalCache() : stride(target::block_size<3>()) { - static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizePerThread::size(dim3{8,8,8})*sizeof(T)); + // sanity check + static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizePerThread::size(dim3{32,16,8})*sizeof(T)); } /** @@ -83,7 +83,10 @@ namespace quda @param[in] a The value to store in the thread local cache @param[in] k The index to use */ - __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); } + __device__ __host__ inline void save(const T &a, const int k) const { + static_assert(N > 0); + save_detail(a, k); + } /** @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. @@ -99,7 +102,10 @@ namespace quda @param[in] k The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ inline T load(const int k) const { return load_detail(k); } + __device__ __host__ inline T load(const int k) const { + static_assert(N > 0); + return load_detail(k); + } /** @brief Cast operator to allow cache objects to be used where T is expected (when N==0). @@ -123,7 +129,10 @@ namespace quda @param[in] i The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ T operator[](int i) { return load(i); } + __device__ __host__ T operator[](int i) { + static_assert(N > 0); + return load(i); + } }; template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) @@ -165,7 +174,7 @@ namespace quda /** @brief Uniform helper for exposing type T, whether we are dealing - with an instance of T or ThreadLocalCache + with an instance of T or ThreadLocalCache */ template struct get_type>>> { diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index b7dacbb9cf..2da596f6ed 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -14,9 +14,9 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the call to the static member - S::size(target::block_dim()). The offset from the beginning of - the total shared memory block is given by the static member + number of elements given by a call to the static member + S::size(target::block_dim()). The byte offset from the beginning + of the total shared memory block is given by the static member O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory @@ -26,7 +26,6 @@ namespace quda private: T *data; - const unsigned int size; // number of elements of type T /** @brief This is a dummy instantiation for the host compiler @@ -57,6 +56,9 @@ namespace quda } public: + /** + @brief Byte offset for this shared memory object. + */ static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -64,6 +66,9 @@ namespace quda return o; } + /** + @brief Shared memory size in bytes. + */ static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block)*sizeof(T); @@ -72,10 +77,12 @@ namespace quda /** @brief Constructor for SharedMemory object. 
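Read against the gauge_stout and gauge_wilson_flow hunks earlier in the series, the Stap/Rect declarations chain their allocations through the offset type; a sketch with the template arguments written out (reconstructed for illustration, the in-tree spelling may differ):

    // Rect names Stap's type as its offset type O, so Rect's shared-memory
    // window starts exactly where Stap's ends: no overlap by construction.
    ThreadLocalCache<Link> Stap;                    // offset 0
    ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // placed after Stap's window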
*/ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), - size(S::size(target::block_dim())) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} - constexpr auto smem() const { return *this; } + /** + @brief Return this SharedMemory object. + */ + constexpr auto sharedMem() const { return *this; } /** @brief Subscripting operator returning a reference to element. From 5889bc45c1bf2b4c8c51889410de48f7668adecb Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 1 Sep 2023 08:17:26 -0500 Subject: [PATCH 17/27] format --- include/targets/cuda/shared_memory_helper.h | 12 +--- include/targets/cuda/thread_array.h | 5 +- include/targets/generic/helpers.h | 24 +++---- .../generic/shared_memory_cache_helper.h | 35 +++++------ include/targets/generic/thread_array.h | 8 +-- include/targets/generic/thread_local_cache.h | 62 ++++++++++++------- include/targets/hip/shared_memory_helper.h | 12 +--- 7 files changed, 74 insertions(+), 84 deletions(-) diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index 2da596f6ed..bc9bd7c66b 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -50,10 +50,7 @@ namespace quda } }; - __device__ __host__ inline T *cache(unsigned int offset) const - { - return target::dispatch(offset); - } + __device__ __host__ inline T *cache(unsigned int offset) const { return target::dispatch(offset); } public: /** @@ -69,15 +66,12 @@ namespace quda /** @brief Shared memory size in bytes. */ - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + S::size(block)*sizeof(T); - } + static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block) * sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) { } /** @brief Return this SharedMemory object. diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index c88a178e2c..8237fcb87d 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -10,7 +10,8 @@ namespace quda { - template struct thread_array : array {}; -} + template struct thread_array : array { + }; +} // namespace quda #endif diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h index f8faf41b44..fcf673db8c 100644 --- a/include/targets/generic/helpers.h +++ b/include/targets/generic/helpers.h @@ -13,25 +13,22 @@ namespace quda @brief Used to declare an object of fixed size. */ template struct SizeStatic { - static constexpr unsigned int size(dim3) { - return N; - } + static constexpr unsigned int size(dim3) { return N; } }; /** @brief Used to declare an object of fixed size per thread, N. */ template struct SizePerThread { - static constexpr unsigned int size(dim3 block) { - return N * block.x * block.y * block.z; - } + static constexpr unsigned int size(dim3 block) { return N * block.x * block.y * block.z; } }; /** @brief Used to declare an object of fixed size per thread, N, with thread dimensions determined by D.
*/ template struct SizeDims { - static constexpr unsigned int size(dim3 block) { + static constexpr unsigned int size(dim3 block) + { dim3 dims = D::dims(block); return dims.x * dims.y * dims.z * N; } @@ -41,19 +38,14 @@ namespace quda @brief Used to declare an object with dimensions given by the block size. */ struct DimsBlock { - static constexpr dim3 dims(dim3 block) { - return block; - } + static constexpr dim3 dims(dim3 block) { return block; } }; /** @brief Used to declare an object with fixed dimensions. */ - template - struct DimsStatic { - static constexpr dim3 dims(dim3) { - return dim3(x,y,z); - } + template struct DimsStatic { + static constexpr dim3 dims(dim3) { return dim3(x, y, z); } }; /** @@ -64,4 +56,4 @@ namespace quda using type = T; }; -} +} // namespace quda diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h index 1866375f76..3e4ad31302 100644 --- a/include/targets/generic/shared_memory_cache_helper.h +++ b/include/targets/generic/shared_memory_cache_helper.h @@ -29,9 +29,9 @@ namespace quda O::shared_mem_size(target::block_dim()) if O is not void. */ template - class SharedMemoryCache : SharedMemory, SizeDims)>, O> + class SharedMemoryCache : SharedMemory, SizeDims)>, O> { - using Smem = SharedMemory, SizeDims)>, O>; + using Smem = SharedMemory, SizeDims)>, O>; public: using value_type = T; @@ -50,7 +50,7 @@ namespace quda static constexpr int n_element = sizeof(T) / sizeof(atom_t); // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array) - template using maybeT = std::conditional_t,T,void>; + template using maybeT = std::conditional_t, T, void>; __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const { @@ -61,8 +61,7 @@ namespace quda for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i]; } - template - __device__ __host__ inline maybeT load_detail(int x, int y, int z) const + template __device__ __host__ inline maybeT load_detail(int x, int y, int z) const { atom_t tmp[n_element]; int j = (z * block.y + y) * block.x + x; @@ -91,19 +90,17 @@ namespace quda /** @brief Constructor for SharedMemoryCache. */ - constexpr SharedMemoryCache() : - block(D::dims(target::block_dim())), stride(block.x * block.y * block.z) + constexpr SharedMemoryCache() : block(D::dims(target::block_dim())), stride(block.x * block.y * block.z) { // sanity check - static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims::size(dim3{32,16,8})*sizeof(T)); + static_assert(shared_mem_size(dim3 {32, 16, 8}) + == Smem::get_offset(dim3 {32, 16, 8}) + SizeDims::size(dim3 {32, 16, 8}) * sizeof(T)); } /** @brief Grab the raw base address to shared memory. */ - __device__ __host__ inline auto data() const { - return reinterpret_cast(&sharedMem()[0]); - } + __device__ __host__ inline auto data() const { return reinterpret_cast(&sharedMem()[0]); } /** @brief Save the value into the 3-d shared memory cache. @@ -179,8 +176,7 @@ namespace quda @param[in] x The x index to use @return The value at coordinates (x,y,z) */ - template - __device__ __host__ inline maybeT load_x(int x = -1) const + template __device__ __host__ inline maybeT load_x(int x = -1) const { auto tid = target::thread_idx(); x = (x == -1) ? 
tid.x : x; @@ -192,8 +188,7 @@ namespace quda @param[in] y The y index to use @return The value at coordinates (x,y,z) */ - template - __device__ __host__ inline maybeT load_y(int y = -1) const + template __device__ __host__ inline maybeT load_y(int y = -1) const { auto tid = target::thread_idx(); y = (y == -1) ? tid.y : y; @@ -205,8 +200,7 @@ namespace quda @param[in] z The z index to use @return The value at coordinates (x,y,z) */ - template - __device__ __host__ inline maybeT load_z(int z = -1) const + template __device__ __host__ inline maybeT load_z(int z = -1) const { auto tid = target::thread_idx(); z = (z == -1) ? tid.z : z; @@ -222,8 +216,7 @@ namespace quda @brief Cast operator to allow cache objects to be used where T is expected */ - template - __device__ __host__ operator maybeT() const { return load(); } + template __device__ __host__ operator maybeT() const { return load(); } /** @brief Assignment operator to allow cache objects to be used on @@ -281,7 +274,9 @@ namespace quda with an instance of T or SharedMemoryCache */ template - struct get_type>>> { + struct get_type>>> { using type = typename T::value_type; }; diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 5325e5ab8f..0e641a11df 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -13,16 +13,14 @@ namespace quda The offset into the shared memory region is determined from the type O. */ - template - class thread_array : SharedMemory, SizePerThread<1>, O> + template class thread_array : SharedMemory, SizePerThread<1>, O> { - using Smem = SharedMemory, SizePerThread<1>, O>; + using Smem = SharedMemory, SizePerThread<1>, O>; using Smem::sharedMem; array &array_; public: - __device__ __host__ constexpr thread_array() : - array_(sharedMem()[target::thread_idx_linear<3>()]) + __device__ __host__ constexpr thread_array() : array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array(); // call default constructor } diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h index 277b16903c..7d33ad40e0 100644 --- a/include/targets/generic/thread_local_cache.h +++ b/include/targets/generic/thread_local_cache.h @@ -19,16 +19,16 @@ namespace quda or array of values (for N_ > 0), which can use shared memory for optimization purposes. 
*/ - template class ThreadLocalCache : - SharedMemory, SizePerThread)>, O> + template + class ThreadLocalCache : SharedMemory, SizePerThread)>, O> { - using Smem = SharedMemory, SizePerThread)>, O>; + using Smem = SharedMemory, SizePerThread)>, O>; public: using value_type = T; static constexpr int N = N_; // size of array, 0 means to behave like T instead of array - using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one - static constexpr int len = std::max(1,N); // actual number of elements to store + using offset_type = O; // type of object using shared memory at the same time that is located before this one + static constexpr int len = std::max(1, N); // actual number of elements to store using Smem::shared_mem_size; private: @@ -46,7 +46,7 @@ namespace quda memcpy(tmp, (void *)&a, sizeof(T)); int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) sharedMem()[(k*n_element + i) * stride + j] = tmp[i]; + for (int i = 0; i < n_element; i++) sharedMem()[(k * n_element + i) * stride + j] = tmp[i]; } __device__ __host__ inline T load_detail(const int k) const @@ -54,7 +54,7 @@ namespace quda atom_t tmp[n_element]; int j = target::thread_idx_linear<3>(); #pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k*n_element + i) * stride + j]; + for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k * n_element + i) * stride + j]; T a; memcpy((void *)&a, tmp, sizeof(T)); return a; @@ -64,16 +64,19 @@ namespace quda /** @brief Constructor for ThreadLocalCache. */ - constexpr ThreadLocalCache() : stride(target::block_size<3>()) { + constexpr ThreadLocalCache() : stride(target::block_size<3>()) + { // sanity check - static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizePerThread::size(dim3{32,16,8})*sizeof(T)); + static_assert(shared_mem_size(dim3 {32, 16, 8}) + == Smem::get_offset(dim3 {32, 16, 8}) + SizePerThread::size(dim3 {32, 16, 8}) * sizeof(T)); } /** @brief Save the value into the thread local cache. Used when N==0 so cache acts like single object. @param[in] a The value to store in the thread local cache */ - __device__ __host__ inline void save(const T &a) const { + __device__ __host__ inline void save(const T &a) const + { static_assert(N == 0); save_detail(a, 0); } @@ -83,7 +86,8 @@ namespace quda @param[in] a The value to store in the thread local cache @param[in] k The index to use */ - __device__ __host__ inline void save(const T &a, const int k) const { + __device__ __host__ inline void save(const T &a, const int k) const + { static_assert(N > 0); save_detail(a, k); } @@ -92,7 +96,8 @@ namespace quda @brief Load a value from the thread local cache. Used when N==0 so cache acts like single object. @return The value stored in the thread local cache */ - __device__ __host__ inline T load() const { + __device__ __host__ inline T load() const + { static_assert(N == 0); return load_detail(0); } @@ -102,7 +107,8 @@ namespace quda @param[in] k The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ inline T load(const int k) const { + __device__ __host__ inline T load(const int k) const + { static_assert(N > 0); return load_detail(k); } @@ -110,7 +116,8 @@ namespace quda /** @brief Cast operator to allow cache objects to be used where T is expected (when N==0). 
*/ - __device__ __host__ operator T() const { + __device__ __host__ operator T() const + { static_assert(N == 0); return load(); } @@ -119,7 +126,8 @@ namespace quda @brief Assignment operator to allow cache objects to be used on the lhs where T is otherwise expected (when N==0). */ - __device__ __host__ void operator=(const T &src) const { + __device__ __host__ void operator=(const T &src) const + { static_assert(N == 0); save(src); } @@ -129,39 +137,46 @@ namespace quda @param[in] i The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ T operator[](int i) { + __device__ __host__ T operator[](int i) + { static_assert(N > 0); return load(i); } }; - template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) { return static_cast(a) + b; } - template __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) + template + __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache &b) { return a + static_cast(b); } - template __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline T operator-(const ThreadLocalCache &a, const T &b) { return static_cast(a) - b; } - template __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) + template + __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache &b) { return a - static_cast(b); } - template __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline auto operator+=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) + b); return a; } - template __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) + template + __device__ __host__ inline auto operator-=(ThreadLocalCache &a, const T &b) { a.save(static_cast(a) - b); return a; @@ -177,7 +192,8 @@ namespace quda with an instance of T or ThreadLocalCache */ template - struct get_type>>> { + struct get_type< + T, std::enable_if_t>>> { using type = typename T::value_type; }; diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index 2da596f6ed..bc9bd7c66b 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -50,10 +50,7 @@ namespace quda } }; - __device__ __host__ inline T *cache(unsigned int offset) const - { - return target::dispatch(offset); - } + __device__ __host__ inline T *cache(unsigned int offset) const { return target::dispatch(offset); } public: /** @@ -69,15 +66,12 @@ namespace quda /** @brief Shared memory size in bytes. */ - static constexpr unsigned int shared_mem_size(dim3 block) - { - return get_offset(block) + S::size(block)*sizeof(T); - } + static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block) * sizeof(T); } /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) { } /** @brief Return this SharedMemory object. 
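These conversion, assignment, and free arithmetic operators let an N==0 cache stand in for T directly in expressions; a hypothetical kernel fragment (U, V, and coeff are placeholders, not from the patch):

    ThreadLocalCache<Link> Stap;
    Stap = U * conj(V);                       // operator=(const T &) -> save()
    Link Z = coeff * static_cast<Link>(Stap); // operator T() -> load()
    Stap += Z;                                // free operator+= round-trips load()/save()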
From 9f7074f3103cee6afdcbf87055b77ad72b39ab33 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 7 Sep 2023 08:50:36 -0700 Subject: [PATCH 18/27] Fix some issues with TuneParam::shared_bytes settings in some of the tuning classes --- include/tunable_block_reduction.h | 8 +++++++- include/tunable_nd.h | 19 ++++++-------------
 include/tunable_reduction.h | 13 ++++++++++++- include/tune_quda.h | 19 +++++++++++-------- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/include/tunable_block_reduction.h b/include/tunable_block_reduction.h index f4552605e5..8c9af36d5a 100644 --- a/include/tunable_block_reduction.h +++ b/include/tunable_block_reduction.h @@ -162,12 +162,18 @@ namespace quda return true; } else { // block.x (spacetime) was reset + auto next = param; + next.block.z += step_z; + auto shared_bytes = setSharedBytes(next); + // we can advance spin/block-color since this is valid if (param.block.z < vector_length_z && param.block.z < device::max_threads_per_block_dim(2) && param.block.x * param.block.y * (param.block.z + step_z) <= device::max_threads_per_block() - && ((param.block.z + step_z) <= max_block_z)) { + && ((param.block.z + step_z) <= max_block_z) + && shared_bytes <= this->maxSharedBytesPerBlock()) { param.block.z += step_z; param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z; + param.shared_bytes = shared_bytes; return true; } else { // we have run off the end so let's reset param.block.z = step_z; diff --git a/include/tunable_nd.h b/include/tunable_nd.h index 43af42f932..b942bc041d 100644 --- a/include/tunable_nd.h +++ b/include/tunable_nd.h @@ -265,8 +265,7 @@ namespace quda auto next = param; next.block.y += step_y; - auto shared_bytes = std::max(this->sharedBytesPerThread() * next.block.x * next.block.y * next.block.z, - this->sharedBytesPerBlock(next)); + auto shared_bytes = this->setSharedBytes(next); // we can advance spin/block-color since this is valid if (param.block.y < vector_length_y && param.block.y < device::max_threads_per_block_dim(1) @@ -279,7 +278,6 @@ namespace quda } else { // we have run off the end so let's reset param.block.y = step_y; param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y; - return false; } } @@ -294,8 +292,7 @@ namespace quda Tunable::initTuneParam(param); param.block.y = step_y; param.grid.y = (vector_length_y + step_y - 1) / step_y; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** @@ -307,8 +304,7 @@ namespace quda Tunable::defaultTuneParam(param); param.block.y = step_y; param.grid.y = (vector_length_y + step_y - 1) / step_y; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** @@ -524,8 +520,7 @@ namespace quda auto next = param; next.block.z += step_z; - auto shared_bytes = std::max(this->sharedBytesPerThread() * next.block.x * next.block.y * next.block.z, - this->sharedBytesPerBlock(next)); + auto shared_bytes = this->setSharedBytes(next); // we can advance spin/block-color since this is valid if (param.block.z < vector_length_z && param.block.z < device::max_threads_per_block_dim(2) @@ -552,8 +547,7 @@ namespace quda TunableKernel2D_base::initTuneParam(param); param.block.z = step_z; param.grid.z = (vector_length_z + step_z - 1) / step_z; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x *
param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** @@ -565,8 +559,7 @@ namespace quda TunableKernel2D_base::defaultTuneParam(param); param.block.z = step_z; param.grid.z = (vector_length_z + step_z - 1) / step_z; - param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z, - this->sharedBytesPerBlock(param)); + this->setSharedBytes(param); } /** diff --git a/include/tunable_reduction.h b/include/tunable_reduction.h index d24904b2fd..a18030fd93 100644 --- a/include/tunable_reduction.h +++ b/include/tunable_reduction.h @@ -166,6 +166,7 @@ namespace quda { TunableKernel::initTuneParam(param); param.block.y = block_size_y; + setSharedBytes(param); } /** @@ -176,6 +177,7 @@ namespace quda { TunableKernel::defaultTuneParam(param); param.block.y = block_size_y; + setSharedBytes(param); } }; @@ -323,11 +325,18 @@ namespace quda if (rtn) { return true; } else { + + auto next = param; + next.block.z++; + auto shared_bytes = setSharedBytes(next); + if (param.block.z < n_batch && param.block.z < device::max_threads_per_block_dim(2) && param.block.x * param.block.y * (param.block.z + 1) <= device::max_threads_per_block() - && param.block.z < n_batch_block_max) { + && param.block.z < n_batch_block_max + && shared_bytes <= this->maxSharedBytesPerBlock()) { param.block.z++; param.grid.z = (n_batch + param.block.z - 1) / param.block.z; + param.shared_bytes = shared_bytes; return true; } else { // we have run off the end so let's reset param.block.z = 1; @@ -346,6 +355,7 @@ TunableReduction2D::initTuneParam(param); param.block = {param.block.x, param.block.y, 1}; param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z}; + setSharedBytes(param); } /** @@ -357,6 +367,7 @@ TunableReduction2D::defaultTuneParam(param); param.block = {param.block.x, param.block.y, 1}; param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z}; + setSharedBytes(param); } }; diff --git a/include/tune_quda.h b/include/tune_quda.h index 1511f6f881..9f88bbd6f9 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -150,6 +150,13 @@ namespace quda { } } + auto setSharedBytes(TuneParam &param) const + { + int nthreads = param.block.x * param.block.y * param.block.z; + param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + return param.shared_bytes; + } + virtual bool advanceBlockDim(TuneParam &param) const { const unsigned int max_threads = maxBlockSize(param); @@ -157,14 +164,12 @@ namespace quda { bool ret; param.block.x += blockStep(); - int nthreads = param.block.x * param.block.y * param.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + setSharedBytes(param); if (param.block.x > max_threads || param.shared_bytes > max_shared || param.block.x * param.block.y * param.block.z > device::max_threads_per_block()) { resetBlockDim(param); - int nthreads = param.block.x * param.block.y * param.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + setSharedBytes(param); ret = false; } else { ret = true; @@ -214,8 +219,7 @@ namespace quda { if (param.shared_bytes > max_shared) { TuneParam next(param); advanceBlockDim(next); // to get next blockDim - int nthreads = next.block.x * next.block.y * next.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(next)); +
param.shared_bytes = setSharedBytes(next); return false; } else { return true; } } @@ -325,8 +329,7 @@ namespace quda { param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1); } - int nthreads = param.block.x*param.block.y*param.block.z; - param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param)); + setSharedBytes(param); } /** sets default values for when tuning is disabled */ From 12c5c9899197215a5ee49d75de6413aa84ff337f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 7 Sep 2023 09:03:50 -0700 Subject: [PATCH 19/27] Fix sharedBytesPerThread for kernels that utilize thread_array --- lib/clover_deriv_quda.cu | 1 + lib/gauge_ape.cu | 1 + lib/gauge_field_strength_tensor.cu | 1 + lib/gauge_force.cu | 1 + lib/gauge_loop_trace.cu | 7 ++++--- lib/gauge_stout.cu | 3 ++- lib/gauge_wilson_flow.cu | 3 ++- 7 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu index 34f121de93..08702f322e 100644 --- a/lib/clover_deriv_quda.cu +++ b/lib/clover_deriv_quda.cu @@ -12,6 +12,7 @@ namespace quda { double coeff; int parity; unsigned int minThreads() const { return gauge.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: DerivativeClover(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, int parity) : diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index 5ace8e5a29..f4a8388f87 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -13,6 +13,7 @@ namespace quda { const GaugeField &in; const Float alpha; unsigned int minThreads() const { return in.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim diff --git a/lib/gauge_field_strength_tensor.cu b/lib/gauge_field_strength_tensor.cu index d0ec026881..d7a555b890 100644 --- a/lib/gauge_field_strength_tensor.cu +++ b/lib/gauge_field_strength_tensor.cu @@ -11,6 +11,7 @@ namespace quda GaugeField &f; const GaugeField &u; unsigned int minThreads() const { return f.VolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: Fmunu(const GaugeField &u, GaugeField &f) : diff --git a/lib/gauge_force.cu b/lib/gauge_force.cu index 2558dadcac..2c89045005 100644 --- a/lib/gauge_force.cu +++ b/lib/gauge_force.cu @@ -12,6 +12,7 @@ namespace quda { double epsilon; const paths<4> &p; unsigned int minThreads() const { return mom.VolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: ForceGauge(const GaugeField &u, GaugeField &mom, double epsilon, const paths<4> &p) : diff --git a/lib/gauge_loop_trace.cu b/lib/gauge_loop_trace.cu index faaaa97d99..9ec8aec5a8 100644 --- a/lib/gauge_loop_trace.cu +++ b/lib/gauge_loop_trace.cu @@ -13,6 +13,7 @@ namespace quda { std::vector& loop_traces; double factor; const paths<1> p; + unsigned int sharedBytesPerThread() const override { return 4 * sizeof(int); } // for thread_array public: // max block size of 8 is arbitrary for now @@ -31,14 +32,14 @@ namespace quda { apply(device::get_default_stream()); } - void apply(const qudaStream_t &stream) + void apply(const qudaStream_t &stream) override { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); GaugeLoopTraceArg arg(u, factor, p); launch(loop_traces, tp, stream, arg); } - long long flops() const + long long
flops() const override { auto Nc = u.Ncolor(); auto mat_mul_flops = 8ll * Nc * Nc * Nc - 2 * Nc * Nc; @@ -46,7 +47,7 @@ namespace quda { return (p.count * mat_mul_flops + p.num_paths * (2 * Nc + 2)) * u.Volume(); } - long long bytes() const { + long long bytes() const override { // links * one LatticeColorMatrix worth of data return p.count * u.Bytes() / 4; } diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index 6177ef8170..bbb676b214 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -19,7 +19,8 @@ namespace quda { unsigned int sharedBytesPerThread() const { // use SharedMemoryCache if using over improvement for two link fields - return improved ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0; + return (improved ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0) + + 4 * sizeof(int); // for thread_array } public: diff --git a/lib/gauge_wilson_flow.cu b/lib/gauge_wilson_flow.cu index a3ce38ba81..32768bb05a 100644 --- a/lib/gauge_wilson_flow.cu +++ b/lib/gauge_wilson_flow.cu @@ -25,7 +25,8 @@ namespace quda { unsigned int sharedBytesPerThread() const { // use SharedMemoryCache if using Symanzik improvement for two Link fields - return wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0; + return (wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0) + + 4 * sizeof(int); // for thread_array } public: From 5ce1230208e2f7e5295eab3e13f9a074443e92c7 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 7 Sep 2023 17:08:35 -0500 Subject: [PATCH 20/27] fix overlapping shared mem --- include/kernels/gauge_stout.cuh | 3 ++- include/kernels/gauge_utils.cuh | 2 ++ include/kernels/gauge_wilson_flow.cuh | 3 ++- include/targets/cuda/thread_array.h | 1 + include/targets/generic/thread_array.h | 2 ++ 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 4577e66fcd..712f191f83 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -135,7 +135,8 @@ namespace quda } Link U, Q; - ThreadLocalCache Stap; + //ThreadLocalCache Stap; + ThreadLocalCache Stap; ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh index 48c7e6c1cc..ded8c9377a 100644 --- a/include/kernels/gauge_utils.cuh +++ b/include/kernels/gauge_utils.cuh @@ -19,6 +19,7 @@ namespace quda // matrix+matrix = 18 floating-point ops // => Total number of floating point ops per function call // dims * (2*18 + 4*198) = dims*828 + using computeStapleOps = thread_array; template __host__ __device__ inline void computeStaple(const Arg &arg, const int *x, const Int *X, const int parity, const int nu, Staple &staple, const int dir_ignore) { @@ -94,6 +95,7 @@ namespace quda // matrix+matrix = 18 floating-point ops // => Total number of floating point ops per function call // dims * (8*18 + 28*198) = dims*5688 + using computeStapleRectangleOps = thread_array; template __host__ __device__ inline void computeStapleRectangle(const Arg &arg, const int *x, const Int *X, const int parity, const int nu, Staple &staple, Rectangle &rectangle, const int dir_ignore) diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index ae28956112..22864ce1b0 100644 --- 
a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -72,7 +72,8 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: // https://arxiv.org/abs/0801.1165 - ThreadLocalCache Stap; + //ThreadLocalCache Stap; + ThreadLocalCache Stap; ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index 8237fcb87d..1c4d7f3244 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -11,6 +11,7 @@ namespace quda { template struct thread_array : array { + static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } }; } // namespace quda diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h index 0e641a11df..d513394cfc 100644 --- a/include/targets/generic/thread_array.h +++ b/include/targets/generic/thread_array.h @@ -20,6 +20,8 @@ namespace quda array &array_; public: + using Smem::shared_mem_size; + __device__ __host__ constexpr thread_array() : array_(sharedMem()[target::thread_idx_linear<3>()]) { array_ = array(); // call default constructor From 56e0ee68c3c34c4d02b8032451435ffdf0bd2ed6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 19 Sep 2023 22:38:00 -0700 Subject: [PATCH 21/27] Bug fix for VUV/VLV kernels now that dynamic shared memory is used --- lib/coarse_op.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh index 4c23f50781..e8b42cd2f3 100644 --- a/lib/coarse_op.cuh +++ b/lib/coarse_op.cuh @@ -245,7 +245,7 @@ namespace quda { unsigned int sharedBytesPerBlock(const TuneParam ¶m) const override { - if (type == COMPUTE_VUV || type == COMPUTE_VLV) + if ((type == COMPUTE_VUV || type == COMPUTE_VLV) && arg.shared_atomic) return 4*sizeof(storeType)*arg.max_color_height_per_block*arg.max_color_width_per_block*4*coarseSpin*coarseSpin; return TunableKernel3D::sharedBytesPerBlock(param); } @@ -577,9 +577,7 @@ namespace quda { if (type == COMPUTE_VUV || type == COMPUTE_VLV || type == COMPUTE_CONVERT || type == COMPUTE_RESCALE) arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 
0 : 1) + dim; arg.kd_dagger = kd_dagger; - if (type == COMPUTE_VUV || type == COMPUTE_VLV) tp.shared_bytes -= sharedBytesPerBlock(tp); // shared memory is static so don't include it in launch Launch(arg, tp, type, stream); - if (type == COMPUTE_VUV || type == COMPUTE_VLV) tp.shared_bytes += sharedBytesPerBlock(tp); // restore shared memory }; /** From c2d336c77ca901e57d5e9fb983e64268641ee7f7 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 19 Sep 2023 23:25:52 -0700 Subject: [PATCH 22/27] Add some shared memory checks when launching kernels --- include/targets/cuda/tunable_kernel.h | 3 +++ include/targets/hip/tunable_kernel.h | 3 +++ include/tune_quda.h | 16 ++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/include/targets/cuda/tunable_kernel.h b/include/targets/cuda/tunable_kernel.h index d7936eb497..7306ab355c 100644 --- a/include/targets/cuda/tunable_kernel.h +++ b/include/targets/cuda/tunable_kernel.h @@ -45,6 +45,7 @@ namespace quda std::enable_if_t(), qudaError_t> launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) { + checkSharedBytes(tp); #ifdef JITIFY launch_error = launch_jitify(kernel.name, tp, stream, arg); #else @@ -62,6 +63,7 @@ namespace quda std::enable_if_t(), qudaError_t> launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) { + checkSharedBytes(tp); #ifdef JITIFY // note we do the copy to constant memory after the kernel has been compiled in launch_jitify launch_error = launch_jitify(kernel.name, tp, stream, arg); @@ -83,6 +85,7 @@ namespace quda template