diff --git a/.gitignore b/.gitignore index a60ac7da48..8e122f0a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ include/jitify_options.hpp .tags* autom4te.cache/* .vscode +cmake/CPM_*.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 803f5dba41..512a7b9baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,12 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() +# For now we only support register tiles for the staggered dslash operators +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels (staggered only)") +if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) + message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") +endif() + set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)") @@ -275,6 +281,7 @@ mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F) mark_as_advanced(QUDA_MAX_MULTI_BLAS_N) mark_as_advanced(QUDA_MAX_MULTI_RHS) +mark_as_advanced(QUDA_MAX_MULTI_RHS_TILE) mark_as_advanced(QUDA_PRECISION) mark_as_advanced(QUDA_RECONSTRUCT) mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE) @@ -420,20 +427,12 @@ if(QUDA_DOWNLOAD_EIGEN) CPMAddPackage( NAME Eigen VERSION ${QUDA_EIGEN_VERSION} - URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 + URL https://gitlab.com/libeigen/eigen/-/archive/e67c494cba7180066e73b9f6234d0b2129f1cdf5.tar.bz2 + URL_HASH SHA256=98d244932291506b75c4ae7459af29b1112ea3d2f04660686a925d9ef6634583 DOWNLOAD_ONLY YES SYSTEM YES) target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR}) install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE) - - # Eigen 3.4 needs to be patched on Neon with nvc++ - if (${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") - set(CMAKE_PATCH_EIGEN OFF CACHE BOOL "Internal use only; do not modify") - if (NOT CMAKE_PATCH_EIGEN) - execute_process(COMMAND patch -N "${Eigen_SOURCE_DIR}/Eigen/src/Core/arch/NEON/Complex.h" "${CMAKE_SOURCE_DIR}/cmake/eigen34_neon.diff") - set(CMAKE_PATCH_EIGEN ON CACHE BOOL "Internal use only; do not modify" FORCE) - endif() - endif() else() # fall back to using find_package find_package(Eigen QUIET) diff --git a/cmake/eigen34_neon.diff b/cmake/eigen34_neon.diff deleted file mode 100644 index 7190ca7fa4..0000000000 --- a/cmake/eigen34_neon.diff +++ /dev/null @@ -1,8 +0,0 @@ -21c21 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ -393c393 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ diff --git a/include/blas_3d.h b/include/blas_3d.h new file mode 100644 index 0000000000..2616a8ac03 --- /dev/null +++ b/include/blas_3d.h @@ -0,0 +1,80 @@ +#pragma once + +#include <vector> +#include <color_spinor_field.h> + +namespace quda +{ + + namespace blas3d + { + + // Local enum for the 3D copy type + enum class copyType { COPY_TO_3D, COPY_FROM_3D, SWAP_3D }; + + /** + @brief Extract / insert / swap a timeslice between a 4-d field and a 3-d field + @param[in] slice Which slice + @param[in] type Extracting a time slice (COPY_TO_3D) or + inserting a timeslice (COPY_FROM_3D) or swapping time slices + (SWAP_3D) + @param[in,out] x 3-d field + @param[in,out] y 4-d field + */ + void copy(int slice, copyType type, ColorSpinorField &x, ColorSpinorField &y); + + /** + @brief Swap the slice in two given fields + @param[in] slice The slice we wish to swap in the fields
@param[in,out] x Field whose slice we wish to swap + @param[in,out] y Field whose slice we wish to swap + */ + void swap(int slice, ColorSpinorField &x, ColorSpinorField &y); + + /** + @brief Compute a set of real-valued inner products <x,y>, where each inner + product is restricted to a timeslice. + @param[out] result Vector of spatial inner products + @param[in] x Left vector field + @param[in] y Right vector field + */ + void reDotProduct(std::vector<double> &result, const ColorSpinorField &x, const ColorSpinorField &y); + + /** + @brief Compute a set of complex-valued inner products <x,y>, where each inner + product is restricted to a timeslice. + @param[out] result Vector of spatial inner products + @param[in] x Left vector field + @param[in] y Right vector field + */ + void cDotProduct(std::vector<Complex> &result, const ColorSpinorField &x, const ColorSpinorField &y); + + /** + @brief Timeslice real-valued scaling of the field + @param[in] a Vector of scale factors (length = local temporal extent) + @param[in] x Field we wish to scale + */ + void ax(std::vector<double> &a, ColorSpinorField &x); + + /** + @brief Timeslice real-valued axpby computation + @param[in] a Vector of scale factors (length = local temporal extent) + @param[in] x Input field + @param[in] b Vector of scale factors (length = local temporal extent) + @param[in,out] y Field we are updating + */ + void axpby(const std::vector<double> &a, const ColorSpinorField &x, const std::vector<double> &b, + ColorSpinorField &y); + + /** + @brief Timeslice complex-valued axpby computation + @param[in] a Vector of scale factors (length = local temporal extent) + @param[in] x Input field + @param[in] b Vector of scale factors (length = local temporal extent) + @param[in,out] y Field we are updating + */ + void caxpby(const std::vector<Complex> &a, const ColorSpinorField &x, const std::vector<Complex> &b, + ColorSpinorField &y); + + } // namespace blas3d +} // namespace quda diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 18de570604..52106e24fa 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -280,6 +280,8 @@ namespace quda //! for deflation etc. if (is_composite) printfQuda("Number of elements = %d\n", composite_dim); } + + void change_dim(int D, int d) { x[D] = d; } }; struct DslashConstant; @@ -1054,6 +1056,38 @@ namespace quda */ void spinorDistanceReweight(ColorSpinorField &src, double alpha0, int t0); + /** + @brief Helper function for determining if the spin of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @return If the spins match, return the number of spins + */ + inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b) + { + int nSpin = 0; + if (a.Nspin() == b.Nspin()) + nSpin = a.Nspin(); + else + errorQuda("Spin %d %d do not match (%s:%d in %s())", a.Nspin(), b.Nspin(), file, line, func); + return nSpin; + } + + /** + @brief Helper function for determining if the spin of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @param[in] args List of additional fields to check spin on + @return If the spins match, return the number of spins + */ + template <typename... Args> + inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b, + const Args &...args) + { + return Spin_(func, file, line, a, b) & Spin_(func, file, line, a, args...); + } + +#define checkSpin(...) \
Spin_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** @brief Helper function for determining if the preconditioning type of the fields is the same. diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index d21121565f..e8049bfb85 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -971,16 +971,19 @@ namespace quda norm_t *norm = nullptr; int norm_offset = 0; if constexpr (fixed) { - if constexpr (block_float) { + if constexpr (fixed && block_float && nColor == 3 && nSpin == 1 && nVec == 1) { + norm = v.norm; + norm_offset = parity * v.norm_offset + 4 * x_cb + 3; + } else if constexpr (block_float) { norm = v.norm; - norm_offset = v.norm_offset; + norm_offset = parity * v.norm_offset + x_cb; } else { scale = v.scale; scale_inv = v.scale_inv; } } return fieldorder_wrapper( - v.v, accessor.index(parity, x_cb, s, c, n, volumeCB), scale, scale_inv, norm, parity * norm_offset + x_cb); + v.v, accessor.index(parity, x_cb, s, c, n, volumeCB), scale, scale_inv, norm, norm_offset); } /** Returns the number of field colors */ diff --git a/include/comm_key.h b/include/comm_key.h index 6618068bb0..a9682ddc03 100644 --- a/include/comm_key.h +++ b/include/comm_key.h @@ -1,27 +1,33 @@ #pragma once +#include + namespace quda { struct CommKey { static constexpr int n_dim = 4; + array key = {0, 0, 0, 0}; - int array[n_dim] = {0, 0, 0, 0}; + constexpr int product() { return key[0] * key[1] * key[2] * key[3]; } - constexpr inline int product() { return array[0] * array[1] * array[2] * array[3]; } + constexpr int &operator[](int d) { return key[d]; } - constexpr inline int &operator[](int d) { return array[d]; } + constexpr const int &operator[](int d) const { return key[d]; } - constexpr inline const int &operator[](int d) const { return array[d]; } + constexpr auto data() { return key.data; } - constexpr inline int *data() { return array; } + constexpr auto data() const { return key.data; } - constexpr inline const int *data() const { return array; } + constexpr bool is_valid() const { return (key[0] > 0) && (key[1] > 0) && (key[2] > 0) && (key[3] > 0); } - constexpr inline bool is_valid() const + bool operator==(const CommKey &other) const { - return (array[0] > 0) && (array[1] > 0) && (array[2] > 0) && (array[3] > 0); + bool is_same = true; + for (auto i = 0; i < n_dim; i++) + if (key[i] != other.key[i]) is_same = false; + return is_same; } }; diff --git a/include/comm_quda.h b/include/comm_quda.h index 4341cf58d9..8f37bcd032 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -50,12 +50,26 @@ namespace quda int comm_dim(int dim); /** - Return the coording of this process in the dimension dim + Return the global number of processes in the dimension dim + @param dim Dimension which we are querying + @return Length of process dimensions + */ + int comm_dim_global(int dim); + + /** + Return the coordinate of this process in the dimension dim @param dim Dimension which we are querying @return Coordinate of this process */ int comm_coord(int dim); + /** + Return the global coordinates of this process in the dimension dim + @param dim Dimension which we are querying + @return Coordinate of this process + */ + int comm_coord_global(int dim); + /** * Declare a message handle for sending `nbytes` to the `rank` with `tag`. 
*/ diff --git a/include/complex_quda.h b/include/complex_quda.h index 18da63def5..1481dec40f 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -360,23 +360,19 @@ struct complex typedef ValueType value_type; // Constructors - __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) + __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) { real(re); imag(im); } - template - __host__ __device__ - inline complex(const complex & z) + template __host__ __device__ inline complex(const complex &z) { real(z.real()); imag(z.imag()); } - template - __host__ __device__ - inline complex(const std::complex & z) + template __host__ __device__ inline complex(const std::complex &z) { real(z.real()); imag(z.imag()); @@ -436,12 +432,11 @@ struct complex template <> struct complex : public float2 { public: typedef float value_type; - complex() = default; - constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } + complex() = default; + constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } template - constexpr complex(const std::complex &z) : - float2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : float2 {static_cast(z.real()), static_cast(z.imag())} { } @@ -500,16 +495,15 @@ template <> struct complex : public float2 { template <> struct complex : public double2 { public: typedef double value_type; - complex() = default; - constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } + complex() = default; + constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } template - constexpr complex(const std::complex &z) : - double2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : double2 {static_cast(z.real()), static_cast(z.imag())} { } - template __host__ __device__ inline complex &operator=(const complex &z) + template __host__ __device__ inline complex &operator=(const complex &z) { real(z.real()); imag(z.imag()); @@ -572,9 +566,9 @@ template <> struct complex : public char2 { public: typedef int8_t value_type; - complex() = default; + complex() = default; - constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } + constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -608,9 +602,9 @@ struct complex : public short2 public: typedef short value_type; - complex() = default; + complex() = default; - constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } + constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -644,9 +638,9 @@ struct complex : public int2 public: typedef int value_type; - complex() = default; + complex() = default; - constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } + constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { diff --git a/include/dslash.h b/include/dslash.h index 4b1aada52e..80638b494e 100644 --- a/include/dslash.h +++ b/include/dslash.h @@ -67,6 +67,10 @@ namespace quda if (arg.xpay) strcat(aux_base, ",xpay"); if (arg.dagger) strcat(aux_base, ",dagger"); setRHSstring(aux_base, in.size()); + 
strcat(aux_base, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, Arg::n_src_tile); + strcat(aux_base, tile_str); } /** @@ -329,7 +333,13 @@ namespace quda Dslash(Arg &arg, cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const std::string &app_base = "") : - TunableKernel3D(in[0], halo.X(4), arg.nParity), arg(arg), out(out), in(in), halo(halo), nDimComms(4), dslashParam(arg) + TunableKernel3D(in[0], (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity), + arg(arg), + out(out), + in(in), + halo(halo), + nDimComms(4), + dslashParam(arg) { if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU Fields not supported in Dslash framework yet"); diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index df176549fc..0e49e64b68 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -12,12 +12,7 @@ #include #include -#if defined(_NVHPC_CUDA) -#include -constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE; -#else constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; -#endif #include @@ -241,11 +236,12 @@ namespace quda return true; } - template struct DslashArg { + template struct DslashArg { using Float = Float_; using real = typename mapper::type; static constexpr int nDim = nDim_; + static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread const int parity; // only use this for single parity fields const int nParity; // number of parities we're working on @@ -269,6 +265,7 @@ namespace quda int threadDimMapLower[4]; int threadDimMapUpper[4]; + int_fastdiv n_src; int_fastdiv Ls; // these are set with symmetric preconditioned twisted-mass dagger @@ -327,6 +324,7 @@ namespace quda exterior_threads(0), threadDimMapLower {}, threadDimMapUpper {}, + n_src(in.size()), Ls(halo.X(4) / in.size()), twist_a(0.0), twist_b(0.0), @@ -650,8 +648,9 @@ namespace quda Arg arg; dslash_functor_arg(const Arg &arg, unsigned int threads_x) : - kernel_param(dim3(threads_x, arg.dc.Ls, arg.nParity)), - arg(arg) { } + kernel_param(dim3(threads_x, (arg.dc.Ls + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity)), arg(arg) + { + } }; /** diff --git a/include/dslash_quda.h b/include/dslash_quda.h index 38efe07e6c..32111e92f9 100644 --- a/include/dslash_quda.h +++ b/include/dslash_quda.h @@ -773,9 +773,12 @@ namespace quda @param[in] a Scale factor applied to derivative @param[in] b Scale factor applied to aux field @param[in] x Vector field we accumulate onto to + @param[in] parity Destination parity + @param[in] comm_override Override for which dimensions are partitioned + @param[in] profile The TimeProfile used for profiling the dslash */ void ApplyLaplace(cvector_ref &out, cvector_ref &in, const GaugeField &U, - int dir, double a, double b, cvector_ref &x, int parity, bool dagger, + int dir, double a, double b, cvector_ref &x, int parity, const int *comm_override, TimeProfile &profile); /** diff --git a/include/eigen_helper.h b/include/eigen_helper.h index 8c1bf012a8..373f9ed865 100644 --- a/include/eigen_helper.h +++ b/include/eigen_helper.h @@ -5,10 +5,6 @@ #define EIGEN_USE_BLAS #endif -#if defined(__NVCOMPILER) // WAR for nvc++ until we update to latest Eigen -#define EIGEN_DONT_VECTORIZE -#endif - #include // hide annoying warning diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h index ee01a51b32..4fcea4b3d3 100644 --- a/include/eigensolve_quda.h +++ b/include/eigensolve_quda.h @@ -148,7 +148,7 @@ namespace quda @param[in] out Output spinor @param[in] 
in Input spinor */ - double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in); + virtual double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in); /** @brief Orthogonalise input vectors r against @@ -263,22 +263,10 @@ namespace quda @brief Compute eigenvalues and their residiua @param[in] mat Matrix operator @param[in] evecs The eigenvectors - @param[in] evals The eigenvalues + @param[in,out] evals The eigenvalues @param[in] size The number of eigenvalues to compute */ - void computeEvals(std::vector &evecs, std::vector &evals, - int size); - - /** - @brief Compute eigenvalues and their residiua. This variant compute the number of converged eigenvalues. - @param[in] mat Matrix operator - @param[in] evecs The eigenvectors - @param[in] evals The eigenvalues - */ - void computeEvals(std::vector &evecs, std::vector &evals) - { - computeEvals(evecs, evals, n_conv); - } + virtual void computeEvals(std::vector &evecs, std::vector &evals, int size = 0); /** @brief Load and check eigenpairs from file @@ -461,6 +449,116 @@ namespace quda void computeBlockKeptRitz(std::vector &kSpace); }; + /** + @brief Thick Restarted Lanczos Method for 3D slices + */ + class TRLM3D : public EigenSolver + { + bool verbose_rank = false; /** Whether this rank is one that logs */ + + public: + /** + @brief Constructor for Thick Restarted Eigensolver class + @param eig_param The eigensolver parameters + @param mat The operator to solve + */ + TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param); + + /** + @return Whether the solver is only for Hermitian systems + */ + virtual bool hermitian() override { return true; } /** TRLM3D is only for Hermitian systems */ + + // Variable size matrix (for the 3D problem) + std::vector> ritz_mat_3D; + + // Arrays for 3D residua + std::vector> residua_3D; + + // Array for convergence + std::vector converged_3D; + std::vector active_3D; + std::vector iter_locked_3D; + std::vector iter_keep_3D; + std::vector iter_converged_3D; + std::vector num_locked_3D; + std::vector num_keep_3D; + std::vector num_converged_3D; + + // Tridiagonal/Arrow matrices, fixed size (for the 3D problem) + std::vector> alpha_3D; + std::vector> beta_3D; + + // The orthogonal direction and size in the 3D problem + int ortho_dim; + int ortho_dim_size; + + /** + @brief Compute eigenpairs + @param[in] kSpace Krylov vector space + @param[in] evals Computed eigenvalues + */ + void operator()(std::vector &kSpace, std::vector &evals) override; + + /** + @brief Lanczos step: extends the Krylov space. + @param[in] v Vector space + @param[in] j Index of vector being computed + */ + void lanczosStep3D(std::vector &v, int j); + + /** + @brief Reorder the Krylov space by eigenvalue + @param[in] kSpace the Krylov space + */ + void reorder3D(std::vector &kSpace); + + /** + @brief Get the eigendecomposition from the arrow matrix + */ + void eigensolveFromArrowMat3D(); + + /** + @brief Rotate the Ritz vectors using the arrow matrix eigendecomposition + @param[in] nKspace current Krylov space + */ + void computeKeptRitz3D(std::vector &kSpace); + + /** + @brief Orthogonalise input vectors r against + vector space v using block-BLAS + @param[in] v Vector space + @param[in] r Vectors to be orthogonalised + @param[in] j Use vectors v[0:j] + @param[in] s array of + */ + void blockOrthogonalize3D(std::vector &v, std::vector &r, int j); + + /** + @brief Check for an initial guess. 
If none present, populate with rands, then + orthonormalise + @param[in] kSpace The Krylov space vectors + */ + void prepareInitialGuess3D(std::vector &kSpace, int ortho_dim_size); + + /** + @brief Estimate the spectral radius of the operator for the max value of the + Chebyshev polynomial + @param[in] mat Matrix operator + @param[in] out Output spinor + @param[in] in Input spinor + */ + double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) override; + + /** + @brief Compute eigenvalues and their residiua + @param[in] evecs The eigenvectors + @param[in,out] evals The eigenvalues + @param[in] size The number of eigenvalues to compute + */ + void computeEvals(std::vector &evecs, std::vector &evals, int size = 0) override; + }; + /** @brief Implicitly Restarted Arnoldi Method. */ diff --git a/include/enum_quda.h b/include/enum_quda.h index f281d33feb..462ce98cf0 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -138,6 +138,7 @@ typedef enum QudaInverterType_s { typedef enum QudaEigType_s { QUDA_EIG_TR_LANCZOS, // Thick restarted lanczos solver QUDA_EIG_BLK_TR_LANCZOS, // Block Thick restarted lanczos solver + QUDA_EIG_TR_LANCZOS_3D, // Thick restarted lanczos solver for 3-d systems QUDA_EIG_IR_ARNOLDI, // Implicitly Restarted Arnoldi solver QUDA_EIG_BLK_IR_ARNOLDI, // Block Implicitly Restarted Arnoldi solver QUDA_EIG_INVALID = QUDA_INVALID_ENUM @@ -277,8 +278,6 @@ typedef enum QudaVerbosity_s { QUDA_INVALID_VERBOSITY = QUDA_INVALID_ENUM } QudaVerbosity; -typedef enum QudaTune_s { QUDA_TUNE_NO, QUDA_TUNE_YES, QUDA_TUNE_INVALID = QUDA_INVALID_ENUM } QudaTune; - typedef enum QudaPreserveDirac_s { QUDA_PRESERVE_DIRAC_NO, QUDA_PRESERVE_DIRAC_YES, diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 596232a7b3..8874959d7b 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -122,8 +122,9 @@ #define QudaEigType integer(4) #define QUDA_EIG_TR_LANCZOS 0 // Thick Restarted Lanczos Solver #define QUDA_EIG_BLK_IR_LANCZOS 1 // Block Thick Restarted Lanczos Solver -#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver -#define QUDA_EIG_BLK_IR_ARNOLDI 3 // Block Implicitly restarted Arnoldi solver (not yet implemented) +#define QUDA_EIG_TR_LANCZOS_3D 2 // Thick Restarted Lanczos Solver for 3-d systems +#define QUDA_EIG_IR_ARNOLDI 3 // Implicitly restarted Arnoldi solver +#define QUDA_EIG_BLK_IR_ARNOLDI 4 // Block Implicitly restarted Arnoldi solver (not yet implemented) #define QUDA_EIG_INVALID QUDA_INVALID_ENUM #define QudaEigSpectrumType integer(4) @@ -247,11 +248,6 @@ #define QUDA_DEBUG_VERBOSE 3 #define QUDA_INVALID_VERBOSITY QUDA_INVALID_ENUM -#define QudaTune integer(4) -#define QUDA_TUNE_NO 0 -#define QUDA_TUNE_YES 1 -#define QUDA_TUNE_INVALID QUDA_INVALID_ENUM - #define QudaPreserveDirac integer(4) #define QUDA_PRESERVE_DIRAC_NO 0 #define QUDA_PRESERVE_DIRAC_YES 1 diff --git a/include/field_cache.h b/include/field_cache.h index 5e05f9acf6..4def6faa3a 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -15,7 +15,7 @@ namespace quda { */ template struct FieldKey { - std::string volume; /** volume kstring */ + std::string volume; /** volume string */ std::string aux; /** auxiliary string */ FieldKey() = default; @@ -78,6 +78,18 @@ namespace quda { */ FieldTmp(const FieldKey &key, const typename T::param_type ¶m); + /** + @brief Create a field temporary that corresponds to the field + constructed from the param struct. 
If a matching field is + present in the cache, it will be popped from the cache. If no + such temporary exists a temporary will be allocated. + @param[in] key Key corresponding to the field instance we + require + @param[in] param Parameter structure used to allocated + the temporary + */ + FieldTmp(typename T::param_type param); + /** @brief Copy constructor is deleted to prevent accidental cache bloat @@ -111,6 +123,18 @@ namespace quda { */ template auto getFieldTmp(const T &a) { return FieldTmp(a); } + /** + @brief Get a field temporary that is identical to the field + instance argument. If a matching field is present in the cache, + it will be popped from the cache. If no such temporary exists, a + temporary will be allocated. When the destructor for the + FieldTmp is called, e.g., the returned object goes out of scope, + the temporary will be pushed onto the cache. + + @param[in] a Field we wish to create a matching temporary for + */ + template auto getFieldTmp(const typename T::param_type ¶m) { return FieldTmp(param); } + /** @brief Get a vector of field temporaries that are identical to the vector instance argument. If enough matching fields are diff --git a/include/gauge_field.h b/include/gauge_field.h index a62b6a684f..776681e44b 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -250,12 +250,12 @@ namespace quda { */ void setTuningString(); + public: /** @brief Initialize the padded region to 0 */ void zeroPad(); - public: /** @brief Default constructor */ @@ -455,7 +455,7 @@ namespace quda { std::enable_if_t && !std::is_pointer_v::type>, T> data() const { if (is_pointer_array(order)) errorQuda("Non dim-array ordered field requested but order is %d", order); - return reinterpret_cast(gauge.data()); + return static_cast(gauge.data()); } /** @@ -473,7 +473,7 @@ namespace quda { "data() requires a pointer cast type"); if (d >= (unsigned)geometry) errorQuda("Invalid array index %d for geometry %d field", d, geometry); if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); - return reinterpret_cast(gauge_array[d].data()); + return static_cast(gauge_array[d].data()); } void *raw_pointer() const @@ -500,7 +500,7 @@ namespace quda { { if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); array u = {}; - for (auto d = 0; d < geometry; d++) u[d] = static_cast(gauge_array[d]); + for (auto d = 0; d < geometry; d++) u[d] = static_cast(gauge_array[d].data()); return u; } @@ -651,9 +651,28 @@ namespace quda { } } + /** + * @brief Print the site data + * @param[in] parity Parity index + * @param[in] dim The dimension in which we are printing + * @param[in] x_cb Checkerboard space-time index + * @param[in] rank The rank we are requesting from (default is rank = 0) + */ + void PrintMatrix(int dim, int parity, unsigned int x_cb, int rank = 0) const; + friend struct GaugeFieldParam; }; + /** + @brief Print the value of the field at the requested coordinates + @param[in] a The field we are printing from + @param[in] dim The dimension in which we are printing + @param[in] parity Parity index + @param[in] x_cb Checkerboard space-time index + @param[in] rank The rank we are requesting from (default is rank = 0) + */ + void genericPrintMatrix(const GaugeField &a, int dim, int parity, unsigned int x_cb, int rank = 0); + /** @brief This is a debugging function, where we cast a gauge field into a spinor field so we can compute its L1 norm. 
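A minimal usage sketch for the blas3d interface declared above in include/blas_3d.h. The function names and signatures are taken from the header itself; the construction of the 3-d field x3 and 4-d field y4 is assumed, as is sizing the coefficient vectors to y4.X(3) (the local temporal extent), which is my reading of the doc comments:

#include <vector>
#include <blas_3d.h>

using namespace quda;

void timeslice_demo(ColorSpinorField &x3, ColorSpinorField &y4, int t)
{
  // pull timeslice t of the 4-d field into the 3-d field
  blas3d::copy(t, blas3d::copyType::COPY_TO_3D, x3, y4);

  // one real inner product per local timeslice
  std::vector<double> w(y4.X(3));
  blas3d::reDotProduct(w, y4, y4);

  // rescale each timeslice by its inverse norm (zero guard omitted)
  std::vector<double> a(w.size());
  for (auto s = 0u; s < w.size(); s++) a[s] = 1.0 / w[s];
  blas3d::ax(a, y4);
}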
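The variadic Spin_ helper above reduces pairwise checks with operator&: every pair either aborts via errorQuda or agrees on the same value, so the fold returns the common spin count. A self-contained sketch of the same pattern; FieldLike and check_spin_ are illustrative stand-ins, not QUDA API:

#include <cstdio>
#include <cstdlib>

struct FieldLike {
  int nspin;
  int Nspin() const { return nspin; }
};

inline int check_spin_(const char *func, const FieldLike &a, const FieldLike &b)
{
  if (a.Nspin() != b.Nspin()) { std::fprintf(stderr, "spin mismatch in %s()\n", func); std::abort(); }
  return a.Nspin();
}

template <typename... Args>
inline int check_spin_(const char *func, const FieldLike &a, const FieldLike &b, const Args &...args)
{
  // both calls return the same value when all checks pass, so & is just that value
  return check_spin_(func, a, b) & check_spin_(func, a, args...);
}

#define CHECK_SPIN(...) check_spin_(__func__, __VA_ARGS__)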
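The tiled launch geometry above, (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, is a ceiling division: each y-block owns a contiguous tile of right-hand sides, and the last tile may be ragged. A small illustrative helper, not QUDA code:

// number of y-blocks needed to cover n_src sources with tiles of width n_src_tile
constexpr int n_tiles(int n_src, int n_src_tile) { return (n_src + n_src_tile - 1) / n_src_tile; }
// e.g. n_tiles(7, 4) == 2: block 0 covers sources 0..3, block 1 the ragged tail 4..6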
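getFieldTmp above returns an RAII handle: a temporary matching the key (volume and aux strings) is popped from the cache, or allocated on a miss, and the destructor pushes it back. A usage sketch; treating tmp directly as a field assumes FieldTmp's implicit conversion to ColorSpinorField, and blas::copy is only a placeholder workload:

#include <field_cache.h>
#include <blas_quda.h>

void workspace_demo(const quda::ColorSpinorField &a)
{
  auto tmp = quda::getFieldTmp(a); // pop-or-allocate a temporary matching a
  quda::blas::copy(tmp, a);        // ... use it like any other field ...
} // tmp goes out of scope here and the temporary returns to the cache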
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index a8809d018f..57992908c2 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -153,8 +153,11 @@ namespace quda { template __host__ __device__ inline constexpr bool fixed_point() { return false; } template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } template __host__ __device__ inline constexpr bool match() { return false; } template<> __host__ __device__ inline constexpr bool match() { return true; } @@ -377,7 +380,7 @@ namespace quda { { for (int d = 0; d < U.Geometry(); d++) u[d] = gauge_ ? static_cast **>(gauge_)[d] : U.data *>(d); - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -459,7 +462,7 @@ namespace quda { ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); } - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -494,7 +497,7 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -583,7 +586,7 @@ namespace quda { ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); } - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -636,7 +639,7 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -715,7 +718,7 @@ namespace quda { nullptr; ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d); } - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 
1.0 : U.LinkMax())); } void resetScale(Float max) @@ -745,8 +748,8 @@ namespace quda { @tparam nSpinCoarse Number of "spin degrees of freedom" (for coarse-link fields only) @tparam order Storage order of the field @tparam native_ghost Whether to use native ghosts (inlined into + the padded area for internal-order fields or use a separate array if false) @tparam storeFloat_ Underlying storage type for the field - the padded area for internal-order fields or use a separate array if false) */ template @@ -1478,7 +1481,7 @@ namespace quda { z *= scale; #pragma unroll for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]); - } else { // stagic phase + } else { // static phase #pragma unroll for (int i = 0; i < 9; i++) { out[i] *= phase; } } diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 990ff949e7..cf1faf72b0 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1106,4 +1106,26 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } + /** + @brief Compute the flattened 4-d index from a separate T and + flattened 3-d XYZ index. + @param[in] t Temporal index + @param[in] xyz Flattened 3-d index + @param[in] X Lattice dimensions + @return The flattened 4-d index + */ + template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) + { + int x[4]; +#pragma unroll + for (int d = 0; d < 4; d++) { + if (d != reduction_dim) { + x[d] = xyz % X[d]; + xyz = xyz / X[d]; + } + } + x[reduction_dim] = t; + return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); + } + } // namespace quda diff --git a/include/kernels/blas_3d.cuh b/include/kernels/blas_3d.cuh new file mode 100644 index 0000000000..e7426e89af --- /dev/null +++ b/include/kernels/blas_3d.cuh @@ -0,0 +1,331 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + struct baseArg : kernel_param<> { + int_fastdiv X[4]; // grid dimensions + + baseArg(dim3 threads, const ColorSpinorField &x) : kernel_param(threads) + { + for (int i = 0; i < 4; i++) { X[i] = x.X()[i]; } + if (x.SiteSubset() == 1) X[0] = 2 * X[0]; + } + }; + + template struct copy3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F y; + F x; + const int slice; + + copy3dArg(ColorSpinorField &y, ColorSpinorField &x, int slice) : + baseArg(dim3(y.VolumeCB(), y.SiteSubset(), 1), y), y(y), x(x), slice(slice) + { + } + }; + + template struct copyTo3d { + const Arg &arg; + constexpr copyTo3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + // Isolate the slice of the 4D array + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 4D data + Vector y = arg.y(x_cb, parity); + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + + // Write to 3D + arg.x(xyz / 2, xyz % 2) = y; + } + } + }; + + template struct copyFrom3d { + const Arg &arg; + constexpr copyFrom3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + int 
idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + Vector x = arg.x(xyz / 2, xyz % 2); + // Write to 4D + arg.y(x_cb, parity) = x; + } + } + }; + + template struct swap3d { + const Arg &arg; + constexpr swap3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + int idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + arg.x(x_cb, parity) = y; + arg.y(x_cb, parity) = x; + } + } + }; + + template struct axpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + real a[MAX_ORTHO_DIM]; + const F x; + real b[MAX_ORTHO_DIM]; + F y; + + axpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct axpby3d { + const Arg &arg; + constexpr axpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + real a = arg.a[idx[3]]; + real b = arg.b[idx[3]]; + + // Compute the axpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct caxpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + complex a[MAX_ORTHO_DIM]; + const F x; + complex b[MAX_ORTHO_DIM]; + F y; + + caxpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct caxpby3d { + const Arg &arg; + constexpr caxpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void 
operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + complex a = arg.a[idx[3]]; + complex b = arg.b[idx[3]]; + + // Compute the caxpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct reDotProduct3dArg : public ReduceArg { + using real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv Xh[4]; // checkerboard grid dimensions + + reDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ double init() const { return double(); } + }; + + template struct reDotProduct3d : plus { + using reduce_t = double; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + const Arg &arg; + constexpr reDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + reduce_t sum = innerProduct(x, y).real(); + + // Apply reduction to t bucket + return plus::operator()(sum, result); + } + }; + + template struct cDotProduct3dArg : public ReduceArg> { + using real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv X[4]; // grid dimensions + int_fastdiv Xh[4]; // checkerboard grid dimensions + + cDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg>(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? 
x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ reduce_t init() const { return {0.0, 0.0}; } + }; + + template struct cDotProduct3d : plus> { + using reduce_t = array; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + + const Arg &arg; + constexpr cDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + complex res = innerProduct(x, y); + + // Apply reduction to temporal bucket + return plus::operator()({res.real(), res.imag()}, result); + } + }; +} // namespace quda diff --git a/include/kernels/contraction.cuh b/include/kernels/contraction.cuh index 96079ed041..815da471d2 100644 --- a/include/kernels/contraction.cuh +++ b/include/kernels/contraction.cuh @@ -32,20 +32,6 @@ namespace quda return ((sink[3] * X[2] + sink[2]) * X[1] + sink[1]) * X[0] + sink[0]; } - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz /= X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct ContractionSummedArg : public ReduceArg { using reduce_t = contract_t; diff --git a/include/kernels/copy_field_offset.cuh b/include/kernels/copy_field_offset.cuh index ea4a603bfd..5f614b2e49 100644 --- a/include/kernels/copy_field_offset.cuh +++ b/include/kernels/copy_field_offset.cuh @@ -142,12 +142,12 @@ namespace quda if (arg.mode == QudaOffsetCopyMode::COLLECT) { // we are collecting so x_cb is the index for the input. idx_in = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.data()); idx_out = linkIndex(coordinate, arg.dim_out); } else { // we are dispersing so x_cb is the index for the output. 
idx_out = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.data()); idx_in = linkIndex(coordinate, arg.dim_in); } copy_field(s * arg.volume_4d_cb_out + idx_out, s * arg.volume_4d_cb_in + idx_in, parity, arg); diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index 799e511b44..62bf0d27d6 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -212,7 +212,7 @@ namespace quda template __device__ __host__ inline Vector d5(const Arg &arg, const Vector &in, int parity, int x_cb, int s, int src_idx) { - + int local_src_idx = target::thread_idx().y / arg.Ls; using real = typename Arg::real; constexpr bool is_variable = true; coeff_type coeff(arg); @@ -235,7 +235,7 @@ namespace quda const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; HalfVector half_in; if (shared) { - half_in = cache.load(threadIdx.x, fwd_s, parity); + half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity); } else { Vector full_in = arg.in[src_idx](fwd_idx, parity); half_in = full_in.project(4, proj_dir); @@ -258,7 +258,7 @@ namespace quda const int back_idx = back_s * arg.volume_4d_cb + x_cb; HalfVector half_in; if (shared) { - half_in = cache.load(threadIdx.x, back_s, parity); + half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity); } else { Vector full_in = arg.in[src_idx](back_idx, parity); half_in = full_in.project(4, proj_dir); @@ -283,7 +283,8 @@ namespace quda { // forwards direction const int fwd_s = (s + 1) % arg.Ls; const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; - const Vector in = shared ? cache.load(threadIdx.x, fwd_s, parity) : arg.in[src_idx](fwd_idx, parity); + const Vector in + = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity) : arg.in[src_idx](fwd_idx, parity); constexpr int proj_dir = dagger ? +1 : -1; if (s == arg.Ls - 1) { out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); @@ -295,7 +296,8 @@ namespace quda { // backwards direction const int back_s = (s + arg.Ls - 1) % arg.Ls; const int back_idx = back_s * arg.volume_4d_cb + x_cb; - const Vector in = shared ? cache.load(threadIdx.x, back_s, parity) : arg.in[src_idx](back_idx, parity); + const Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity) : + arg.in[src_idx](back_idx, parity); constexpr int proj_dir = dagger ? -1 : +1; if (s == 0) { out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); @@ -378,6 +380,7 @@ namespace quda __device__ __host__ inline Vector constantInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx) { + int local_src_idx = target::thread_idx().y / arg.Ls; using real = typename Arg::real; const auto k = arg.kappa; const auto inv = arg.inv; @@ -395,7 +398,8 @@ namespace quda for (int s = 0; s < arg.Ls; s++) { - Vector in = shared ? cache.load(threadIdx.x, s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); { int exp = s_ < s ? 
arg.Ls - s + s_ : s_ - s; @@ -436,6 +440,7 @@ namespace quda __device__ __host__ inline Vector variableInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx) { + int local_src_idx = target::thread_idx().y / arg.Ls; constexpr int nSpin = 4; using real = typename Arg::real; typedef ColorSpinor HalfVector; @@ -461,7 +466,7 @@ namespace quda auto factorR = (s_ < s ? -arg.m_f * R : R); if (shared) { - r += factorR * cache.load(threadIdx.x, s, parity); + r += factorR * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); } else { Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); r += factorR * in.project(4, proj_dir); @@ -489,7 +494,7 @@ namespace quda auto factorL = (s_ > s ? -arg.m_f * L : L); if (shared) { - l += factorL * cache.load(threadIdx.x, s, parity); + l += factorL * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); } else { Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); l += factorL * in.project(4, proj_dir); @@ -518,7 +523,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorR = (s_ < s ? -arg.m_f * R : R); - Vector in = shared ? cache.load(threadIdx.x, s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); r += factorR * in.project(4, proj_dir); R *= coeff.kappa(s); @@ -537,7 +543,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorL = (s_ > s ? -arg.m_f * L : L); - Vector in = shared ? cache.load(threadIdx.x, s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); l += factorL * in.project(4, proj_dir); L *= coeff.kappa(s); diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index 94e0465238..a6a5b3c1d5 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -11,8 +11,10 @@ namespace quda { int *getPackComms(); - template + constexpr int pack_tile_size = 1; + + template struct PackArg : kernel_param<> { typedef Float_ Float; @@ -24,6 +26,7 @@ namespace quda static constexpr bool dagger = dagger_; static constexpr int twist = twist_; // whether we are doing preconditioned twisted-mass or not (1 - singlet, 2 - doublet) static constexpr QudaPCType pc_type = pc_type_; // preconditioning type (4-d or 5-d) + static constexpr int n_src_tile = n_src_tile_; static constexpr bool spinor_direct_load = false; // false means texture load @@ -54,6 +57,7 @@ namespace quda int sites_per_block; + int_fastdiv n_src; int_fastdiv Ls; char *packBuffer[4 * QUDA_MAX_DIM]; @@ -96,6 +100,7 @@ namespace quda threadDimMapLower {}, threadDimMapUpper {}, sites_per_block((work_items + grid - 1) / grid), + n_src(in.size()), Ls(halo.X(4) / in.size()) #ifdef NVSHMEM_COMMS , @@ -131,7 +136,7 @@ namespace quda } }; - template + template __device__ __host__ inline void pack(const Arg &arg, int ghost_idx, int s, int parity, int src_idx) { typedef typename mapper::type real; @@ -160,46 +165,52 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? 
+1 : -1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } else { // forwards int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? -1 : +1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } } - template + template __device__ __host__ inline void packStaggered(const Arg &arg, int ghost_idx, int parity, int src_idx) { typedef typename mapper::type real; @@ -217,12 +228,18 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - Vector f = 
arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } @@ -250,24 +267,36 @@ namespace quda if (Arg::pc_type == QUDA_5D_PC) { // 5-d checkerboarded, include s (not ghostFaceCB since both faces) switch (dim) { case 0: - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, + parity, src_idx); break; case 1: - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, + parity, src_idx); break; case 2: - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, + parity, src_idx); break; case 3: - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, + parity, src_idx); break; } } else { // 4-d checkerboarding, keeping s separate (if it exists) switch (dim) { - case 0: pack(arg, ghost_idx, s, parity, src_idx); break; - case 1: pack(arg, ghost_idx, s, parity, src_idx); break; - case 2: pack(arg, ghost_idx, s, parity, src_idx); break; - case 3: pack(arg, ghost_idx, s, parity, src_idx); break; + case 0: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 1: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 2: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 3: + pack(arg, ghost_idx, s, parity, src_idx); + break; } } @@ -288,7 +317,8 @@ namespace quda // 64 - use uber kernel (merge exterior) template struct packShmem { - template __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -315,9 +345,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.dc.ghostFaceCB[0] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -325,9 +355,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.dc.ghostFaceCB[1] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * 
target::block_dim().x; } break; @@ -335,9 +365,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.dc.ghostFaceCB[2] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -345,9 +375,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.dc.ghostFaceCB[3] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -358,12 +388,19 @@ namespace quda #endif } - __device__ __forceinline__ void operator()(const Arg &arg, int s, int parity, int twist_pack) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s_block_, int parity, int twist_pack) { - switch (twist_pack) { - case 0: this->operator()<0>(arg, s, parity); break; - case 1: this->operator()<1>(arg, s, parity); break; - case 2: this->operator()<2>(arg, s, parity); break; + int src_s_block = MAX_MULTI_RHS == 1 ? 0 : src_s_block_; + int src_s_idx = src_s_block * Arg::n_src_tile; + if (src_s_idx + n_src_tile * arg.Ls <= arg.n_src * arg.Ls) { + switch (twist_pack) { + case 0: this->operator()<0, n_src_tile>(arg, src_s_idx, parity); break; + case 1: this->operator()<1, n_src_tile>(arg, src_s_idx, parity); break; + case 2: this->operator()<2, n_src_tile>(arg, src_s_idx, parity); break; + } + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_s_block, parity); } } }; @@ -377,7 +414,7 @@ namespace quda { if (arg.nParity == 1) parity = arg.parity; packShmem pack; - pack.operator()(arg, s, parity); + pack.operator()(arg, s, parity, Arg::twist); } }; @@ -400,17 +437,17 @@ namespace quda if (arg.nFace == 1) { switch (dim) { - case 0: packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } else if (arg.nFace == 3) { switch (dim) { - case 0: packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } @@ -422,7 +459,7 @@ namespace quda template struct 
packStaggeredShmem { - __device__ __forceinline__ void operator()(const Arg &arg, int src_idx, int parity, int = 0) + template __device__ __forceinline__ void apply(const Arg &arg, int src_idx, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -446,9 +483,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[0] + local_tid; if (arg.nFace == 1) - packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -456,9 +493,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[1] + local_tid; if (arg.nFace == 1) - packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -466,9 +503,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[2] + local_tid; if (arg.nFace == 1) - packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -476,9 +513,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[3] + local_tid; if (arg.nFace == 1) - packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -488,6 +525,18 @@ namespace quda if (arg.shmem) shmem_signal(dim, dir, arg); #endif } + + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_idx_block_, int parity, int = 0) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(arg, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_idx_block, parity); + } + } }; template struct pack_staggered_shmem { diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 7eafa4a687..ec09ab1622 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -14,9 +14,9 @@ namespace quda /** @brief Parameter structure for driving the Staggered Dslash operator */ - template - struct StaggeredArg : DslashArg { + template + struct StaggeredArg : DslashArg { typedef typename mapper::type real; static constexpr int nColor = nColor_; static constexpr int nSpin = 1; @@ -37,7 +37,6 @@ namespace quda using GL = typename gauge_mapper::type; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field */ F in[MAX_MULTI_RHS]; /** input vector field */ const Ghost halo_pack; /** accessor for writing the halo */ @@ -57,8 +56,8 @@ namespace quda StaggeredArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a, cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, - spin_project, comm_override), + DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, + improved_ ? 3 : 1, spin_project, comm_override), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), @@ -87,9 +86,9 @@ namespace quda @param[in] parity The site parity @param[in] x_cb The checkerboarded site index */ - template - __device__ __host__ inline void applyStaggered(Vector &out, const Arg &arg, Coord &coord, int parity, int, - int thread_dim, bool &active, int src_idx) + template + __device__ __host__ inline void applyStaggered(array &out, const Arg &arg, Coord &coord, + int parity, int, int thread_dim, bool &active, int src_idx) { typedef typename mapper::type real; typedef Matrix, Arg::nColor> Link; @@ -104,13 +103,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? 
arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.in[src_idx](fwd_idx, their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } } @@ -120,14 +126,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in - = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in = arg.in[src_idx](fwd3_idx, their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } } @@ -140,15 +152,22 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back_idx = linkIndexM1(coord, arg.dim, d); const int gauge_idx = back_idx; const Link U = arg.improved ? 
arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.in[src_idx](back_idx, their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } } @@ -159,15 +178,21 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); - const Vector in - = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); - const Vector in = arg.in[src_idx](back3_idx, their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } } } // nDim @@ -181,8 +206,8 @@ namespace quda constexpr staggered(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + template + __device__ __host__ __forceinline__ void apply(int idx, int src_idx, int parity) { using real = typename mapper::type; using Vector = ColorSpinor; @@ -195,20 +220,41 @@ namespace quda const int my_spinor_parity = nParity == 2 ? parity : 0; - Vector out; + array out; + applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - - out *= arg.dagger_scale; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity); - out = arg.a * x - out; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x - out[s]; + } } else if (mykernel_type != INTERIOR_KERNEL) { - Vector x = arg.out[src_idx](coord.x_cb, my_spinor_parity); - out = x + (xpay ? -out : out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = x + (xpay ? -out[s] : out[s]); + } + } + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } + } + } + + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block_, int parity) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(idx, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(idx, src_idx_block, parity); } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out; } }; diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index 514c4df957..584999d52f 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -34,7 +34,6 @@ namespace quda typedef typename mapper::type real; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field set */ F in[MAX_MULTI_RHS]; /** input vector field set */ F x[MAX_MULTI_RHS]; /** input vector set when doing xpay */ diff --git a/include/kernels/evec_project.cuh b/include/kernels/evec_project.cuh index 7bea55d908..add4b7caba 100644 --- a/include/kernels/evec_project.cuh +++ b/include/kernels/evec_project.cuh @@ -47,25 +47,11 @@ namespace quda { __device__ __host__ spinor_array init() const { return spinor_array(); } }; - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz = xyz / X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct EvecProjection : plus { using reduce_t = spinor_array; - using plus::operator(); - static constexpr int reduce_block_dim = 1; // only doing a reduct in the x thread dimension - + using plus::operator(); + static constexpr int reduce_block_dim = 1; // only doing a reduce in the x thread dimension + const Arg &arg; constexpr EvecProjection(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index d0846de352..a3c2b1b377 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -44,8 +44,8 @@ namespace quda LaplaceArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, int dir, double a, double b, - cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override), + cvector_ref &x, int parity, const int *comm_override) : + DslashArg(out, in, halo, U, x, parity, false, a != 0.0 ? true : false, 1, false, comm_override), halo_pack(halo), halo(halo), U(U), @@ -74,7 +74,7 @@ namespace quda @param[in] thread_dim Which dimension this thread corresponds to (fused exterior only) */ - template + template __device__ __host__ inline void applyLaplace(Vector &out, Arg &arg, Coord &coord, int parity, int, int thread_dim, bool &active, int src_idx) { @@ -136,15 +136,16 @@ namespace quda } // out(x) = M*in - template struct laplace : dslash_default { + template struct laplace : dslash_default { const Arg &arg; constexpr laplace(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_, int parity) { + int src_idx = MAX_MULTI_RHS == 1 ? 
0 : src_idx_; using real = typename mapper::type; using Vector = ColorSpinor; @@ -163,12 +164,10 @@ namespace quda // case 4 is an operator in all x,y,z,t dimensions // case 3 is a spatial operator only, the t dimension is omitted. switch (arg.dir) { - case 3: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); - break; + case 3: applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; case 4: default: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); + applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; } diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 22323e66a2..85e1790318 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -179,9 +179,9 @@ namespace quda if (arg.is_t0_kernel) { if (arg.t0 < 0) return; - if (mykernel_type == INTERIOR_KERNEL) { + if constexpr (mykernel_type == INTERIOR_KERNEL) { idx += arg.t0 * arg.t0_offset; - } else if (mykernel_type != EXTERIOR_KERNEL_ALL) { + } else if constexpr (mykernel_type != EXTERIOR_KERNEL_ALL) { if (idx >= arg.t0_face_size[mykernel_type]) idx += arg.face_size[mykernel_type] - arg.t0_face_size[mykernel_type]; idx += arg.t0 * arg.t0_face_offset[mykernel_type]; diff --git a/include/lattice_field.h b/include/lattice_field.h index 462add3bde..2cf1f6beaa 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -902,6 +902,40 @@ namespace quda { #define checkPrecision(...) Precision_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** + @brief Helper function for determining if the color of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @return If color is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a_, const T2 &b_) + { + const unwrap_t &a(a_); + const unwrap_t &b(b_); + int nColor = 0; + if (a.Ncolor() == b.Ncolor()) + nColor = a.Ncolor(); + else + errorQuda("Color %d %d do not match (%s:%d in %s())", a.Ncolor(), b.Ncolor(), file, line, func); + return nColor; + } + + /** + @brief Helper function for determining if the color of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @param[in] args List of additional fields to check color on + @return If colors is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a, const T2 &b, const Args &...args) + { + return Color_(func, file, line, a, b) & Color_(func, file, line, a, args...); + } + +#define checkColor(...) Color_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** @brief Helper function for determining if the field is in native order @param[in] a Input field diff --git a/include/quda.h b/include/quda.h index 51d0b98043..1ef14401f3 100644 --- a/include/quda.h +++ b/include/quda.h @@ -9,6 +9,7 @@ * as the Fortran interface in lib/quda_fortran.F90. */ +#include // bool support #include #include /* for FILE */ #include @@ -283,8 +284,6 @@ extern "C" { double temp; /**< The mean temperature of the device for the duration of the solve */ double clock; /**< The mean clock frequency of the device for the duration of the solve */ - QudaTune tune; /**< Enable auto-tuning? 
(default = QUDA_TUNE_YES) */ - /** Number of steps in s-step algorithms */ int Nsteps; @@ -496,7 +495,7 @@ extern "C" { QudaBoolean preserve_deflation; /** This is where we store the deflation space. This will point - to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ + to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ void *preserve_deflation_space; /** If we restore the deflation space, this boolean indicates + false, but preserve_deflation would be true */ QudaBoolean preserve_evals; + /** Whether to use the smeared gauge field for the Dirac operator + whose eigenvalues we are computing. */ + bool use_smeared_gauge; + /** What type of Dirac operator we are using **/ /** If !(use_norm_op) && !(use_dagger) use M. **/ /** If use_dagger, use Mdag **/ @@ -569,6 +572,12 @@ extern "C" { /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ char QUDA_logfile[512]; + /** The orthogonal direction in the 3D eigensolver **/ + int ortho_dim; + + /** The local size of the orthogonal direction in the 3D eigensolver **/ + int ortho_dim_size_local; + //------------------------------------------------- // EIG-CG PARAMS @@ -1488,7 +1497,7 @@ extern "C" { * @param inGauge Pointer to the device gauge field (QUDA device field) * @param param The parameters of the host and device fields */ - void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); + void saveGaugeFieldQuda(void *outGauge, void *inGauge, QudaGaugeParam *param); /** * Reinterpret gauge as a pointer to a GaugeField and call destructor. diff --git a/include/quda_define.h.in b/include/quda_define.h.in index c17b361b56..a9622c8ea1 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -25,6 +25,12 @@ */ #define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@ +/** + * @def MAX_MULTI_RHS_TILE + * @brief This macro sets the register tile size for MRHS kernels + */ +#define MAX_MULTI_RHS_TILE @QUDA_MAX_MULTI_RHS_TILE@ + #cmakedefine QUDA_HETEROGENEOUS_ATOMIC #ifdef QUDA_HETEROGENEOUS_ATOMIC /** @@ -36,6 +42,16 @@ #undef QUDA_HETEROGENEOUS_ATOMIC #endif +#cmakedefine QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +/** + * @def HETEROGENEOUS_ATOMIC_INF_INIT + * @brief This macro sets whether we are using infinity for the signaling sentinel + */ +#define HETEROGENEOUS_ATOMIC_INF_INIT +#undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#endif + #cmakedefine QUDA_LARGE_KERNEL_ARG #cmakedefine QUDA_DIRAC_CLOVER_HASENBUSCH diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index da3e178bcf..aceb88c83d 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -11,12 +11,17 @@ #include #include #include -#include +#include using count_t = cuda::atomic; namespace quda { +// By default we use negative infinity as the sentinel when testing for +// reduction completion. On some compilers we may need to use finite +// numbers, so the alternative approach uses negative zero instead; the +// choice is controlled by the CMake option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT.
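The negative-zero fallback works because IEEE 754 equality treats -0.0 and +0.0 as equal: a plain comparison against the sentinel cannot tell an untouched slot from a reduction that genuinely computed zero, so the sign bit must be inspected as well. The following is a minimal host-side sketch of that test (an illustration only, not part of the patch; it mirrors the is_complete helper added below):

#include <cassert>
#include <cmath> // std::signbit

// A slot still holding the initialization sentinel -0.0 is incomplete; any
// other value, including a genuinely computed +0.0, counts as complete.
template <typename T> bool is_complete(const T &result)
{
  return !(result == static_cast<T>(0.0) && std::signbit(result));
}

int main()
{
  double slot = -0.0;         // initialization sentinel
  assert(-0.0 == 0.0);        // equality alone cannot detect the sentinel
  assert(!is_complete(slot)); // the sign bit identifies the untouched slot
  slot = 0.0;                 // a result that happens to be exactly zero
  assert(is_complete(slot));  // correctly treated as complete
}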
+#ifdef HETEROGENEOUS_ATOMIC_INF_INIT /** @brief The initialization value we used to check for completion */ @@ -28,6 +33,31 @@ namespace quda */ template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } + /** + @brief Test if the result is complete (e.g., is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return result != init_value(); } +#else + /** + @brief The initialization value we used to check for completion + */ + template constexpr T init_value() { return -static_cast(0.0); } + + /** + @brief The termination value we use to prevent a possible hang in + case the computed reduction is equal to the initialization + */ + template constexpr T terminate_value() { return static_cast(0.0); } + + /** + @brief Test if the result is complete (e.g., is not equal to the sentinel) + */ + template bool is_complete(const T &result) + { + return !(result == static_cast(0.0) && std::signbit(result)); + } +#endif + // declaration of reduce function template __device__ inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx = 0); @@ -129,7 +159,7 @@ namespace quda if (consumed) errorQuda("Cannot call complete more than once for each construction"); for (int i = 0; i < n_reduce * n_item; i++) { - while (result_h[i].load(cuda::std::memory_order_relaxed) == init_value()) { } + while (!is_complete(result_h[i].load(cuda::std::memory_order_relaxed))) { } } // copy back result element by element and convert if necessary to host reduce type @@ -169,11 +199,7 @@ namespace quda auto tid = target::thread_idx_linear<2>(); if (arg.result_d) { // write to host mapped memory -#ifdef _NVHPC_CUDA // WAR for nvc++ - constexpr bool coalesced_write = false; -#else constexpr bool coalesced_write = true; -#endif if constexpr (coalesced_write) { static_assert(n <= device::warp_size(), "reduction array is greater than warp size"); auto mask = __ballot_sync(0xffffffff, tid < n); diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h deleted file mode 100644 index 1c4d7f3244..0000000000 --- a/include/targets/cuda/thread_array.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifndef _NVHPC_CUDA - -#include "../generic/thread_array.h" - -#else - -#include - -namespace quda -{ - template struct thread_array : array { - static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } - }; -} // namespace quda - -#endif diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h deleted file mode 100644 index 24f986cc26..0000000000 --- a/include/targets/hip/thread_array.h +++ /dev/null @@ -1 +0,0 @@ -#include "../generic/thread_array.h" diff --git a/include/tune_quda.h b/include/tune_quda.h index 03b9b148ee..b49ccc9753 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -43,6 +43,14 @@ namespace quda { */ const std::map &getTuneCache(); + /** + @brief Unify all instances of the tunecache across ranks. This + is called after returning to a global communicator. 
+ @param[in] rank_list The list of ranks from whose tunecaches we + want to merge to form the global tunecache + */ + void joinTuneCache(const std::vector &rank_list); + /** @brief Return a string encoding the QUDA version */ @@ -61,7 +69,7 @@ namespace quda { class Tunable { - friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity); + friend TuneParam tuneLaunch(Tunable &, bool, QudaVerbosity); static inline uint64_t _flops_global = 0; static inline uint64_t _bytes_global = 0; @@ -417,7 +425,7 @@ namespace quda { * @param[in] verbosity What verbosity to use during tuning? * @return The tuned launch parameters */ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); + TuneParam tuneLaunch(Tunable &tunable, bool enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); /** * @brief Post an event in the trace, recording where it was posted diff --git a/include/util_quda.h b/include/util_quda.h index 3d68fb5a2e..031f878762 100644 --- a/include/util_quda.h +++ b/include/util_quda.h @@ -20,7 +20,25 @@ namespace quda @brief Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_ENABLE_TUNING=0. @return If autotuning is enabled */ -QudaTune getTuning(); +bool getTuning(); + +/** + @brief Set the tuning state + @param[in] tuning New tuning state + */ +void setTuning(bool tuning); + +/** + @brief Push a new tuning state, and back up the present one on the + stack. + @param[in] tune New tuning state + */ +void pushTuning(bool tune); + +/** + @brief Pop the present tuning state and restore what is on the stack. + */ +void popTuning(); QudaVerbosity getVerbosity(); char *getOutputPrefix(); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 017341adc5..d88ba6a239 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -20,8 +20,9 @@ set (QUDA_OBJS solve.cpp monitor.cpp dirac_coarse.cpp dslash_coarse.cpp coarse_op.cpp coarsecoarse_op.cpp coarse_op_preconditioned.cpp staggered_coarse_op.cpp - eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp vector_io.cpp - eigensolve_quda.cpp quda_arpack_interface.cpp + eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp + eig_trlm_3d.cpp blas_3d.cu + vector_io.cpp eigensolve_quda.cpp quda_arpack_interface.cpp multigrid.cpp transfer.cpp block_orthogonalize.cpp prolongator.cpp restrictor.cpp staggered_prolong_restrict.cu gauge_phase.cu timer.cpp diff --git a/lib/blas_3d.cu b/lib/blas_3d.cu new file mode 100644 index 0000000000..acab3da85e --- /dev/null +++ b/lib/blas_3d.cu @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + namespace blas3d + { + + template class copy3D : TunableKernel2D + { + ColorSpinorField &y; + ColorSpinorField &x; + const int t_slice; + const copyType type; + unsigned int minThreads() const { return y.VolumeCB(); } + + public: + copy3D(ColorSpinorField &y, ColorSpinorField &x, int t_slice, copyType type) : + TunableKernel2D(y, y.SiteSubset()), y(y), x(x), t_slice(t_slice), type(type) + { + // Check slice value + if (t_slice < 0 || t_slice >= y.X()[3]) errorQuda("Unexpected slice %d", t_slice); + + strcat(aux, type == copyType::SWAP_3D ? ",swap_3d" : type == copyType::COPY_TO_3D ? 
",to_3d" : ",from_3d"); + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + copy3dArg arg(y, x, t_slice); + switch (type) { + case copyType::COPY_TO_3D: launch(tp, stream, arg); break; + case copyType::COPY_FROM_3D: launch(tp, stream, arg); break; + case copyType::SWAP_3D: launch(tp, stream, arg); break; + default: errorQuda("Unknown 3D copy type"); + } + } + + void preTune() + { + x.backup(); + y.backup(); + } + void postTune() + { + x.restore(); + y.restore(); + } + long long bytes() const + { + return (type == copyType::SWAP_3D ? 2 : 1) * (x.Bytes() / x.X(3) + y.Bytes() / y.X(3)); + } + }; + + void copy(const int slice, const copyType type, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + // Check orth dim + if (x.X()[3] != 1) errorQuda("Unexpected dimensions in x[3]=%d", x.X()[3]); + // We must give a 4D Lattice field as the first argument + instantiate(y, x, slice, type); + } + + void swap(int slice, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + instantiate(x, y, slice, copyType::SWAP_3D); + } + + template class axpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + axpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, axpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 6 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + }; + + void axpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + void ax(const std::vector &a, ColorSpinorField &x) + { + std::vector zeros(a.size(), 0.0); + axpby(a, x, zeros, x); + } + + template class caxpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + caxpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, + const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, caxpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 14 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + 
}; + + void caxpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + template class reDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + reDotProduct3D(const ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + reDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 2; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void reDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + template class cDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + cDotProduct3D(const ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + cDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 8; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void cDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + } // namespace blas3d + +} // namespace quda diff --git a/lib/check_params.h b/lib/check_params.h index 068c9a4d3e..07a639df09 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -175,6 +175,7 @@ void printQudaEigParam(QudaEigParam *param) { P(preserve_deflation, QUDA_BOOLEAN_FALSE); P(preserve_deflation_space, 0); P(preserve_evals, QUDA_BOOLEAN_TRUE); + P(use_smeared_gauge, false); P(use_dagger, QUDA_BOOLEAN_FALSE); P(use_norm_op, QUDA_BOOLEAN_FALSE); P(compute_svd, QUDA_BOOLEAN_FALSE); @@ -578,6 +579,7 @@ void printQudaInvertParam(QudaInvertParam *param) { // domain decomposition parameters //P(inv_type_sloppy, QUDA_INVALID_INVERTER); // disable since invalid means no preconditioner #if defined INIT_PARAM + P(dslash_type_precondition, QUDA_INVALID_DSLASH); P(inv_type_precondition, QUDA_INVALID_INVERTER); P(preconditioner, 0); P(tol_precondition, INVALID_DOUBLE); diff --git 
a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index ef370eda18..71664614cf 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -329,7 +329,7 @@ namespace quda { printf("rank = %d x = %u, s = %d, { ", comm_rank(), x_cb, s); for (int c = 0; c < o.Ncolor(); c++) { auto value = complex(o(parity, x_cb, s, c)); - printf("(%f,%f) ", value.real(), value.imag()); + printf("(%g,%g) ", value.real(), value.imag()); } printf("}\n"); } @@ -378,8 +378,6 @@ namespace quda { void genericPrintVector(const ColorSpinorField &a, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - ColorSpinorParam param(a); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_COPY_FIELD_CREATE; @@ -388,6 +386,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const ColorSpinorField &a_ = !host_clone ? a : *clone_a.get(); + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; diff --git a/lib/communicator_mpi.cpp b/lib/communicator_mpi.cpp index 16d6f46c17..8bc24808ea 100644 --- a/lib/communicator_mpi.cpp +++ b/lib/communicator_mpi.cpp @@ -52,6 +52,7 @@ namespace quda } comm_init(nDim, commDims, rank_from_coords, map_data); + globalReduce.push(true); } @@ -120,6 +121,9 @@ namespace quda grid_size, size); } + // defer handling MPI errors to QUDA + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_qmp.cpp b/lib/communicator_qmp.cpp index 1a4a2a7878..67d4a8694f 100644 --- a/lib/communicator_qmp.cpp +++ b/lib/communicator_qmp.cpp @@ -166,6 +166,9 @@ void Communicator::comm_init(int ndim, const int *dims, QudaCommsMap rank_from_c grid_size, QMP_comm_get_number_of_nodes(QMP_COMM_HANDLE)); } + // defer handling MPI errors to QMP + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_stack.cpp b/lib/communicator_stack.cpp index ef5db59a0a..adfd332f2b 100644 --- a/lib/communicator_stack.cpp +++ b/lib/communicator_stack.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace quda { @@ -49,8 +50,16 @@ namespace quda void push_communicator(const CommKey &split_key) { if (comm_nvshmem_enabled()) - errorQuda( - "Split-grid is currently not supported with NVSHMEM. Please set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + errorQuda("Split-grid is currently not supported with NVSHMEM. Set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + + // if reverting to global, we will need to join the tunecaches + bool join_tune_cache = split_key == default_comm_key; + int local_rank = comm_rank(); + int local_tune_rank = 0; + + // used to store the size of the tunecache at the point of splitting + static size_t tune_cache_size = 0; + auto search = communicator_stack.find(split_key); if (search == communicator_stack.end()) { communicator_stack.emplace(std::piecewise_construct, std::forward_as_tuple(split_key), @@ -59,7 +68,38 @@ namespace quda LatticeField::freeGhostBuffer(); // Destroy the (IPC) Comm buffers with the old communicator. 
+ auto split_key_old = current_key; current_key = split_key; + + // we are returning to global so we need to join any diverged tunecaches + if (join_tune_cache) { + // has this tunecache been updated? + int tune_cache_update = tune_cache_size != getTuneCache().size(); + // have any of tunecaches across the split grid been updated? + comm_allreduce_int(tune_cache_update); + + if (tune_cache_update) { + auto num_sub_partition = split_key_old.product(); // the number of caches we need to merge + int sub_partition_dims[] = {comm_dim(0) / split_key_old[0], comm_dim(1) / split_key_old[1], + comm_dim(2) / split_key_old[2], comm_dim(3) / split_key_old[3]}; + int sub_partition_coords[] = {comm_coord(0) / sub_partition_dims[0], comm_coord(1) / sub_partition_dims[1], + comm_coord(2) / sub_partition_dims[2], comm_coord(3) / sub_partition_dims[3]}; + auto sub_partition_idx = sub_partition_coords[split_key_old.n_dim - 1]; + for (auto d = split_key_old.n_dim - 2; d >= 0; d--) + sub_partition_idx = sub_partition_idx * split_key_old[d] + sub_partition_coords[d]; + + std::vector global_tune_ranks(num_sub_partition); + for (auto i = 0; i < num_sub_partition; i++) { + global_tune_ranks[i] = (sub_partition_idx == i && local_tune_rank == local_rank) ? comm_rank() : 0; + comm_allreduce_int(global_tune_ranks[i]); + } + // we now have a list of all the global tune ranks so we can join them + joinTuneCache(global_tune_ranks); + } + } else if (!join_tune_cache) { + // record size of tunecache when first splitting the grid + tune_cache_size = getTuneCache().size(); + } } #if defined(QMP_COMMS) || defined(MPI_COMMS) @@ -70,8 +110,12 @@ namespace quda int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); } + int comm_dim_global(int dim) { return get_default_communicator().comm_dim(dim); } + int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); } + int comm_coord_global(int dim) { return get_default_communicator().comm_coord(dim); } + int comm_rank_from_coords(const int *coords) { return get_current_communicator().comm_rank_from_coords(coords); } void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, bool user_set_comm_handle, diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp index 8ce5acf133..b388d4c001 100644 --- a/lib/dslash_coarse.in.cpp +++ b/lib/dslash_coarse.in.cpp @@ -94,9 +94,9 @@ namespace quda IntList<@QUDA_MULTIGRID_NVEC_LIST@>()); } else { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder); - ColorSpinorField v_inB = create_color_spinor_copy(inB, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); + auto v_inA = create_color_spinor_copy(inA, csOrder); + auto v_inB = create_color_spinor_copy(inB, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); if (dslash) { BlockTransposeForward(v_inA, inA); } if (clover) { BlockTransposeForward(v_inB, inB); } diff --git a/lib/dslash_pack2.cu b/lib/dslash_pack2.cu index 043ea65abc..01b70f3284 100644 --- a/lib/dslash_pack2.cu +++ b/lib/dslash_pack2.cu @@ -139,6 +139,10 @@ namespace quda strcpy(aux, "policy_kernel,"); strcat(aux, in.AuxString().c_str()); setRHSstring(aux, in.size()); + strcat(aux, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, pack_tile_size); + strcat(aux, tile_str); char comm[5]; for (int i = 0; i < 4; i++) comm[i] = (comm_dim_pack[i] ? 
'1' : '0'); comm[4] = '\0'; diff --git a/lib/eig_trlm_3d.cpp b/lib/eig_trlm_3d.cpp new file mode 100644 index 0000000000..7480a58395 --- /dev/null +++ b/lib/eig_trlm_3d.cpp @@ -0,0 +1,670 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + // Thick Restarted Lanczos Method constructor + TRLM3D::TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param) : EigenSolver(mat, eig_param) + { + getProfile().TPSTART(QUDA_PROFILE_INIT); + + ortho_dim = eig_param->ortho_dim; + ortho_dim_size = eig_param->ortho_dim_size_local; + if (ortho_dim != 3) + errorQuda("Only 3D spatial splitting (ortho_dim = 3) is supported, ortho_dim passed = %d", ortho_dim); + verbose_rank = comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0; + + // Tridiagonal/Arrow matrices + alpha_3D.resize(ortho_dim_size); + beta_3D.resize(ortho_dim_size); + + residua_3D.resize(ortho_dim_size); + ritz_mat_3D.resize(ortho_dim_size); + converged_3D.resize(ortho_dim_size, false); + active_3D.resize(ortho_dim_size, false); + + iter_locked_3D.resize(ortho_dim_size, 0); + iter_keep_3D.resize(ortho_dim_size, 0); + iter_converged_3D.resize(ortho_dim_size, 0); + + num_locked_3D.resize(ortho_dim_size, 0); + num_keep_3D.resize(ortho_dim_size, 0); + num_converged_3D.resize(ortho_dim_size, 0); + + for (int i = 0; i < ortho_dim_size; i++) { + alpha_3D[i].resize(n_kr, 0.0); + beta_3D[i].resize(n_kr, 0.0); + residua_3D[i].resize(n_kr, 0.0); + } + + // 3D thick restart specific checks + if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be at least n_ev+6=%d\n", n_kr, n_ev + 6); + + if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) { + errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver"); + } + + getProfile().TPSTOP(QUDA_PROFILE_INIT); + } + + void TRLM3D::operator()(std::vector &kSpace, std::vector &evals) + { + // Create 3-d split communicators + CommKey split_key = {1, 1, 1, comm_dim(3)}; + + if (!split_key.is_valid()) { + errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]); + } + logQuda(QUDA_DEBUG_VERBOSE, "Splitting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n", + comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]); + push_communicator(split_key); + + // Override any user input for block size. + block_size = 1; + + // Pre-launch checks and preparation + //--------------------------------------------------------------------------- + queryPrec(kSpace[0].Precision()); + // Check to see if we are loading eigenvectors + if (strcmp(eig_param->vec_infile, "") != 0) { + printfQuda("Loading evecs from file name %s\n", eig_param->vec_infile); + loadFromFile(kSpace, evals); + return; + } + + // Check for an initial guess. If none present, populate with rands, then + // orthonormalise + prepareInitialGuess3D(kSpace, ortho_dim_size); + + // Increase the size of kSpace passed to the function; it will be trimmed back + // to its original size before exit.
+ prepareKrylovSpace(kSpace, evals); + + // Check for Chebyshev maximum estimation + checkChebyOpMax(kSpace); + + // Convergence and locking criteria + std::vector mat_norm_3D(ortho_dim_size, 0.0); + double epsilon = setEpsilon(kSpace[0].Precision()); + + // Print Eigensolver params + printEigensolverSetup(); + //--------------------------------------------------------------------------- + + // Begin TRLM Eigensolver computation + //--------------------------------------------------------------------------- + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + int t_offset = ortho_dim_size * comm_coord(3); + + // Loop over restart iterations. + while (restart_iter < max_restarts && !converged) { + + // Get min step + int step_min = *std::min_element(num_locked_3D.begin(), num_locked_3D.end()); + comm_allreduce_min(step_min); + + for (int step = step_min; step < n_kr; step++) lanczosStep3D(kSpace, step); + iter += (n_kr - step_min); + + // The eigenvalues are returned in the alpha array + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + eigensolveFromArrowMat3D(); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + // mat_norm is updated and used for LR + for (int t = 0; t < ortho_dim_size; t++) + for (int i = num_locked_3D[t]; i < n_kr; i++) + if (fabs(alpha_3D[t][i]) > mat_norm_3D[t]) mat_norm_3D[t] = fabs(alpha_3D[t][i]); + + // Lambda that returns mat_norm for LR and returns the relevant alpha + // (the corresponding Ritz value) for SR + auto check_norm = [&](double sr_norm, int t) -> double { + if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) + return mat_norm_3D[t]; + else + return sr_norm; + }; + + // Locking check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_locked_3D[t] = 0; + for (int i = 0; i < (n_kr - num_locked_3D[t]); i++) { + if (residua_3D[t][i + num_locked_3D[t]] < epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_locked_3D[t] = i + 1; + } else { + // Unlikely to find new locked pairs + break; + } + } + } + } + + // Convergence check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_converged_3D[t] = iter_locked_3D[t]; + for (int i = iter_locked_3D[t]; i < n_kr - num_locked_3D[t]; i++) { + if (residua_3D[t][i + num_locked_3D[t]] < tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_converged_3D[t] = i + 1; + } else { + // Unlikely to find new converged pairs + break; + } + } + } + } + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) + iter_keep_3D[t] + = std::min(iter_converged_3D[t] + (n_kr - num_converged_3D[t]) / 2, n_kr - num_locked_3D[t] - 12); + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + computeKeptRitz3D(kSpace); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + int min_nconv = n_kr + 1; + int min_nlock = n_kr + 1; + int min_nkeep = n_kr + 1; + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + num_converged_3D[t] = num_locked_3D[t] + iter_converged_3D[t]; + num_keep_3D[t] = num_locked_3D[t] + iter_keep_3D[t]; + num_locked_3D[t] += iter_locked_3D[t]; + + if (num_converged_3D[t] < min_nconv) min_nconv = num_converged_3D[t]; + if (num_locked_3D[t] < min_nlock) 
min_nlock = num_locked_3D[t]; + if (num_keep_3D[t] < min_nkeep) min_nkeep = num_keep_3D[t]; + + // Use printf to get data from t dim only + if (getVerbosity() >= QUDA_DEBUG_VERBOSE && verbose_rank) { + printf("%04d converged eigenvalues for timeslice %d at restart iter %04d\n", num_converged_3D[t], + t_offset + t, restart_iter + 1); + printf("iter Conv[%d] = %d\n", t_offset + t, iter_converged_3D[t]); + printf("iter Keep[%d] = %d\n", t_offset + t, iter_keep_3D[t]); + printf("iter Lock[%d] = %d\n", t_offset + t, iter_locked_3D[t]); + printf("num_converged[%d] = %d\n", t_offset + t, num_converged_3D[t]); + printf("num_keep[%d] = %d\n", t_offset + t, num_keep_3D[t]); + printf("num_locked[%d] = %d\n", t_offset + t, num_locked_3D[t]); + for (int i = 0; i < n_kr; i++) { + printf("Ritz[%d][%d] = %.16e residual[%d] = %.16e\n", t_offset + t, i, alpha_3D[t][i], i, residua_3D[t][i]); + } + } + } + } + + // Check for convergence + bool all_converged = true; + for (int t = 0; t < ortho_dim_size; t++) { + if (num_converged_3D[t] >= n_conv) { + converged_3D[t] = true; + } else { + all_converged = false; + } + } + + if (getVerbosity() >= QUDA_VERBOSE && verbose_rank) { + std::stringstream converged; + std::copy(converged_3D.begin(), converged_3D.end(), std::ostream_iterator(converged, "")); + printf("iter = %d rank = %d converged = %s min nlock %3d nconv %3d nkeep %3d\n", restart_iter + 1, + comm_rank_global(), converged.str().c_str(), min_nlock, min_nconv, min_nkeep); + } + + if (all_converged) { + reorder3D(kSpace); + converged = true; + } + + restart_iter++; + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + + // Post computation report + //--------------------------------------------------------------------------- + if (!converged) { + if (eig_param->require_convergence) { + errorQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Exiting.", + n_conv, n_ev, n_kr, max_restarts); + } else { + warningQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. 
Continuing with current Lanczos factorisation.", + n_conv, n_ev, n_kr, max_restarts); + // Compute eigenvalues + computeEvals(kSpace, evals); + } + } else { + if (verbose_rank) { + if (getVerbosity() >= QUDA_SUMMARIZE) { + printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n", + comm_rank_global(), n_conv, restart_iter, iter); + } + + if (getVerbosity() >= QUDA_VERBOSE) { + // Dump all Ritz values and residua if using Chebyshev + if (eig_param->use_poly_acc) { + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < n_conv; i++) { + printf("RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0, + residua_3D[t][i]); + } + } + } + } + } + + // Compute eigenvalues + computeEvals(kSpace, evals); + } + + push_communicator(default_comm_key); + + // ensure all processes have all e-values + comm_allreduce_sum(evals); + + // Local clean-up + cleanUpEigensolver(kSpace, evals); + } + + // Thick Restart 3D Member functions + //--------------------------------------------------------------------------- + void TRLM3D::lanczosStep3D(std::vector &v, int j) + { + // Compute r[t] = A[t] * v_j[t] - b_{j-1}[t] * v_{j-1}[t] + // r[t] = A[t] * v_j[t] + + // Use this while we have only axpby in place of axpy; + std::vector unit(ortho_dim_size, 1.0); + std::vector alpha_j(ortho_dim_size, 0.0); + std::vector beta_j(ortho_dim_size, 0.0); + + // 3D vectors that hold data for individual t components + ColorSpinorParam csParamClone(v[0]); + csParamClone.change_dim(ortho_dim, 1); + std::vector vecs_t(1, csParamClone); + ColorSpinorField r_t(csParamClone); + + // Identify the active 3D slices. This is the only place in the + // entire algorithm where the active_3D array should be + // modified. + for (int t = 0; t < ortho_dim_size; t++) { + // Every element of the active array must be assessed + active_3D[t] = (num_keep_3D[t] <= j && !converged_3D[t]) ? true : false; + } + + // This will be a blocked operator with no + // connections in the ortho_dim (usually t) + // hence the 3D sections of each vector + // will be independent. + chebyOp(r[0], v[j]); + + // a_j[t] = v_j^dag[t] * r[t] + blas3d::reDotProduct(alpha_j, v[j], r[0]); + for (int t = 0; t < ortho_dim_size; t++) { + // Only active problem data is recorded + if (active_3D[t]) alpha_3D[t][j] = alpha_j[t]; + } + + // r[t] = r[t] - a_j[t] * v_j[t] + for (int t = 0; t < ortho_dim_size; t++) alpha_j[t] = active_3D[t] ? -alpha_j[t] : 0.0; + blas3d::axpby(alpha_j, v[j], unit, r[0]); + + // r[t] = r[t] - b_{j-1}[t] * v_{j-1}[t] + // Only orthogonalise active problems + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + int start = (j > num_keep_3D[t]) ?
j - 1 : 0; + if (j - start > 0) { + + // Ensure we have enough 3D vectors + if ((int)vecs_t.size() < j - start) resize(vecs_t, j - start, csParamClone); + + // Copy the 3D data into the 3D vectors, create beta array, and create + // pointers to the 3D vectors + std::vector beta_t; + beta_t.reserve(j - start); + // Copy residual + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, r_t, r[0]); + // Copy vectors + for (int i = start; i < j; i++) { + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i - start], v[i]); + beta_t.push_back(-beta_3D[t][i]); + } + + // r[t] = r[t] - beta[t]{j-1} * v[t]{j-1} + blas::block::axpy(beta_t, {vecs_t.begin(), vecs_t.begin() + j - start}, r_t); + + // Copy residual back to 4D vector + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, r_t, r[0]); + } + } + } + + // Orthogonalise r against the Krylov space + for (int k = 0; k < 1; k++) blockOrthogonalize3D(v, r, j + 1); // future work: up to 4 times??? + + // b_j[t] = ||r[t]|| + blas3d::reDotProduct(beta_j, r[0], r[0]); + for (int t = 0; t < ortho_dim_size; t++) beta_j[t] = active_3D[t] ? sqrt(beta_j[t]) : 0.0; + + // Prepare next step. + // v_{j+1}[t] = r[t] / b_{j}[t] + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + beta_3D[t][j] = beta_j[t]; + beta_j[t] = 1.0 / beta_j[t]; + } + } + + std::vector c(ortho_dim_size); + for (int t = 0; t < ortho_dim_size; t++) c[t] = beta_j[t] == 0.0 ? 1.0 : 0.0; + blas3d::axpby(beta_j, r[0], c, v[j + 1]); + } + + void TRLM3D::reorder3D(std::vector &kSpace) + { + for (int t = 0; t < ortho_dim_size; t++) { + int i = 0; + if (reverse) { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] >= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } else { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] <= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } + } + } + + void TRLM3D::eigensolveFromArrowMat3D() + { + getProfile().TPSTART(QUDA_PROFILE_EIGEN); + + // Loop over the 3D problems +#pragma omp parallel for + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + + int dim = n_kr - num_locked_3D[t]; + int arrow_pos = num_keep_3D[t] - num_locked_3D[t]; + + // Eigen objects + MatrixXd A = MatrixXd::Zero(dim, dim); + ritz_mat_3D[t].resize(dim * dim); + for (int i = 0; i < dim * dim; i++) ritz_mat_3D[t][i] = 0.0; + + // Invert the spectrum due to chebyshev + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr - 1; i++) { + alpha_3D[t][i] *= -1.0; + beta_3D[t][i] *= -1.0; + } + alpha_3D[t][n_kr - 1] *= -1.0; + } + + // Construct arrow mat A_{dim,dim} + for (int i = 0; i < dim; i++) { + + // alpha_3D populates the diagonal + A(i, i) = alpha_3D[t][i + num_locked_3D[t]]; + } + + for (int i = 0; i < arrow_pos; i++) { + + // beta_3D populates the arrow + A(i, arrow_pos) = beta_3D[t][i + num_locked_3D[t]]; + A(arrow_pos, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + for (int i = arrow_pos; i < dim - 1; i++) { + + // beta_3D populates the sub-diagonal + A(i, i + 1) = beta_3D[t][i + num_locked_3D[t]]; + A(i + 1, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + // Eigensolve the arrow matrix + SelfAdjointEigenSolver eigensolver; + eigensolver.compute(A); + + // repopulate ritz matrix + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) ritz_mat_3D[t][dim * i + j] = eigensolver.eigenvectors().col(i)[j]; + + for (int i = 0; i < dim; i++) { + 
residua_3D[t][i + num_locked_3D[t]] = fabs(beta_3D[t][n_kr - 1] * eigensolver.eigenvectors().col(i)[dim - 1]); + // Update the alpha_3D array + alpha_3D[t][i + num_locked_3D[t]] = eigensolver.eigenvalues()[i]; + } + + // Put spectrum back in order + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr; i++) { alpha_3D[t][i] *= -1.0; } + } + } + } + + getProfile().TPSTOP(QUDA_PROFILE_EIGEN); + } + + void TRLM3D::computeKeptRitz3D(std::vector &kSpace) + { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + // Multi-BLAS friendly array to store part of Ritz matrix we want + std::vector> ritz_mat_keep(ortho_dim_size); + + std::vector vecs_t; + std::vector kSpace_t; + + ColorSpinorParam csParamClone(kSpace[0]); + csParamClone.create = QUDA_ZERO_FIELD_CREATE; + csParamClone.change_dim(ortho_dim, 1); + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + int dim = n_kr - num_locked_3D[t]; + int keep = iter_keep_3D[t]; + + ritz_mat_keep[t].resize(dim * keep); + for (int j = 0; j < dim; j++) { + for (int i = 0; i < keep; i++) { ritz_mat_keep[t][j * keep + i] = ritz_mat_3D[t][i * dim + j]; } + } + + // Alias the vectors we wish to keep. + vector_ref vecs_locked(kSpace.begin() + num_locked_3D[t], + kSpace.begin() + num_locked_3D[t] + dim); + + // multiBLAS axpy. Create 3D vectors so that we may perform all t independent + // vector rotations. + vecs_t.reserve(dim); + kSpace_t.reserve(keep); + + // Create 3D arrays + for (int i = vecs_t.size(); i < dim; i++) vecs_t.push_back(csParamClone); + for (int i = kSpace_t.size(); i < keep; i++) kSpace_t.push_back(csParamClone); + + blas::zero(kSpace_t); + + // Copy data to the 3D arrays, zero out the workspace, make pointers + for (int i = 0; i < dim; i++) blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i], vecs_locked[i]); + + // Compute the axpy + blas::block::axpy(ritz_mat_keep[t], {vecs_t.begin(), vecs_t.begin() + dim}, + {kSpace_t.begin(), kSpace_t.begin() + keep}); + + // Copy back to the 4D workspace array + + // Copy compressed Krylov + for (int i = 0; i < keep; i++) { + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, kSpace_t[i], kSpace[num_locked_3D[t] + i]); + } + + // Update residual vector + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[0], kSpace[n_kr]); + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, vecs_t[0], kSpace[num_locked_3D[t] + keep]); + + // Update sub arrow matrix + for (int i = 0; i < keep; i++) + beta_3D[t][i + num_locked_3D[t]] = beta_3D[t][n_kr - 1] * ritz_mat_3D[t][dim * (i + 1) - 1]; + } + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + } + + // Orthogonalise r[t][0:] against V_[t][0:j] + void TRLM3D::blockOrthogonalize3D(std::vector &vecs, std::vector &rvecs, int j) + { + for (int i = 0; i < j; i++) { + std::vector s_t(ortho_dim_size, 0.0); + std::vector unit_new(ortho_dim_size, {1.0, 0.0}); + + // Block dot products stored in s_t. + blas3d::cDotProduct(s_t, vecs[i], rvecs[0]); + for (int t = 0; t < ortho_dim_size; t++) { s_t[t] *= active_3D[t] ?
-1.0 : 0.0; } + + // Block orthogonalise + blas3d::caxpby(s_t, vecs[i], unit_new, rvecs[0]); + } + } + + void TRLM3D::prepareInitialGuess3D(std::vector &kSpace, int ortho_dim_size) + { + auto nrm = blas::norm2(kSpace[0]); + if (!std::isfinite(nrm) || nrm == 0.0) { + RNG rng(kSpace[0], 1234); + spinorNoise(kSpace[0], rng, QUDA_NOISE_UNIFORM); + } + + std::vector norms(ortho_dim_size, 0.0); + std::vector zeros(ortho_dim_size, 0.0); + blas3d::reDotProduct(norms, kSpace[0], kSpace[0]); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, kSpace[0], zeros, kSpace[0]); + } + + double TRLM3D::estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) + { + RNG rng(in, 1234); + spinorNoise(in, rng, QUDA_NOISE_UNIFORM); + + // Power iteration + std::vector norms(ortho_dim_size, 0.0); + std::vector zeros(ortho_dim_size, 0.0); + for (int i = 0; i < 100; i++) { + if ((i + 1) % 10 == 0) { + blas3d::reDotProduct(norms, in, in); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, in, zeros, in); + } + mat(out, in); + std::swap(out, in); + } + + // Compute spectral radius estimate + std::vector inner_products(ortho_dim_size, 0.0); + blas3d::reDotProduct(inner_products, out, in); + + auto result = *std::max_element(inner_products.begin(), inner_products.end()); + comm_allreduce_max(result); + if (verbose_rank && getVerbosity() >= QUDA_VERBOSE) + printf("Chebyshev max = %e (rank = %d)\n", result, comm_rank_global()); + + // Increase final result by 10% for safety + return result * 1.10; + } + + void TRLM3D::computeEvals(std::vector &evecs, std::vector &evals, int size) + { + if (size == 0) size = n_conv; + if (size > (int)evecs.size()) + errorQuda("Requesting %d eigenvectors with only storage allocated for %lu", size, evecs.size()); + if (size > (int)evals.size()) + errorQuda("Requesting %d eigenvalues with only storage allocated for %lu", size, evals.size()); + + ColorSpinorParam csParamClone(evecs[0]); + ColorSpinorField tmp(csParamClone); + + std::vector> evals_t(size, std::vector(ortho_dim_size, 0.0)); + std::vector norms(ortho_dim_size, 0.0); + std::vector unit(ortho_dim_size, 1.0); + std::vector n_unit(ortho_dim_size, {-1.0, 0.0}); + + for (int i = 0; i < size; i++) { + // r = A * v_i + mat(tmp, evecs[i]); + + // lambda_i = v_i^dag A v_i / (v_i^dag * v_i) + // Block dot products stored in s_t. 
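// Editor's note: per timeslice t the two slice-wise reductions below evaluate
//   lambda_i[t] = <v_i, A v_i>_t / ||v_i||_t,
// one complex dot product for the numerator and one real norm for the
// denominator; for eigenvectors that are unit-normalized on each slice this
// is the Rayleigh quotient restricted to that slice.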
+ blas3d::cDotProduct(evals_t[i], evecs[i], tmp); + blas3d::reDotProduct(norms, evecs[i], evecs[i]); + for (int t = 0; t < ortho_dim_size; t++) evals_t[i][t] /= sqrt(norms[t]); + + // Measure ||lambda_i*v_i - A*v_i|| + blas3d::caxpby(evals_t[i], evecs[i], n_unit, tmp); + blas3d::reDotProduct(norms, tmp, tmp); + for (int t = 0; t < ortho_dim_size; t++) residua_3D[t][i] = sqrt(norms[t]); + } + + // If size = n_conv, this routine is called post sort + if (size == n_conv) { + evals.resize(ortho_dim_size * comm_dim_global(ortho_dim) * n_conv, 0.0); + + if (verbose_rank) { + int t_offset = ortho_dim_size * comm_coord_global(3); + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < size; i++) { + + // Use printf to get data from t dim only + if (getVerbosity() >= QUDA_VERBOSE) { + printf("Eval[%02d][%04d] = (%+.16e,%+.16e) residual = %+.16e\n", t_offset + t, i, evals_t[i][t].real(), + evals_t[i][t].imag(), residua_3D[t][i]); + } + + // Transfer evals to eval array + evals[(t_offset + t) * size + i] = evals_t[i][t]; + } + } + } + } + } + +} // namespace quda diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index 8130897cbb..52f4fd1100 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -104,6 +104,10 @@ namespace quda logQuda(QUDA_VERBOSE, "Creating TR Lanczos eigensolver\n"); eig_solver = new TRLM(mat, eig_param); break; + case QUDA_EIG_TR_LANCZOS_3D: + logQuda(QUDA_VERBOSE, "Creating TR Lanczos 3-d eigensolver\n"); + eig_solver = new TRLM3D(mat, eig_param); + break; case QUDA_EIG_BLK_TR_LANCZOS: logQuda(QUDA_VERBOSE, "Creating Block TR Lanczos eigensolver\n"); eig_solver = new BLKTRLM(mat, eig_param); @@ -236,7 +240,7 @@ namespace quda int n_eig = n_conv; if (compute_svd) n_eig *= 2; kSpace.resize(n_eig); - evals.resize(n_conv); + if (eig_param->eig_type != QUDA_EIG_TR_LANCZOS_3D) evals.resize(n_conv); // Only save if outfile is defined if (strcmp(eig_param->vec_outfile, "") != 0) { @@ -580,6 +584,7 @@ namespace quda void EigenSolver::computeEvals(std::vector &evecs, std::vector &evals, int size) { + if (size == 0) size = n_conv; auto batch_size = eig_param->compute_evals_batch_size; if (size > static_cast(evecs.size())) @@ -683,7 +688,7 @@ namespace quda case QUDA_SPECTRUM_SR_EIG: printfQuda("'SR' -> sort with real(x) in increasing algebraic order, smallest first.\n"); break; case QUDA_SPECTRUM_LI_EIG: printfQuda("'LI' -> sort with imag(x) in decreasing algebraic order, largest first.\n"); break; case QUDA_SPECTRUM_SI_EIG: printfQuda("'SI' -> sort with imag(x) in increasing algebraic order, smallest first\n"); break; - default: errorQuda("Unkown spectrum type requested: %d", spec_type); + default: errorQuda("Unknown spectrum type requested: %d", spec_type); } } diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index 5ea09e3059..7283bf39d0 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -35,6 +35,21 @@ namespace quda { } } + template FieldTmp::FieldTmp(typename T::param_type param) + { + param.create = QUDA_REFERENCE_FIELD_CREATE; + key = FieldKey(T(param)); + + auto it = cache.find(key); + if (it != cache.end() && it->second.size()) { // found an entry + tmp = std::move(it->second.top()); + it->second.pop(); // pop the defunct object + } else { // no entry found, we must allocate a new field + param.create = QUDA_ZERO_FIELD_CREATE; + tmp = T(param); + } + } + template FieldTmp::~FieldTmp() { // don't cache the field if it's empty (e.g., has been moved) diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 
20cf7621ac..40a1351656 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -1355,7 +1355,7 @@ namespace quda { qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost); } else { if (is_pointer_array(order)) { - char *dst_buffer = reinterpret_cast(buffer); + char *dst_buffer = static_cast(buffer); for (int d = 0; d < site_dim; d++) { std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim); } @@ -1375,7 +1375,7 @@ namespace quda { qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice); } else { if (is_pointer_array(order)) { - const char *dst_buffer = reinterpret_cast(buffer); + const char *dst_buffer = static_cast(buffer); for (int d = 0; d < site_dim; d++) { std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim); } @@ -1389,4 +1389,9 @@ namespace quda { } } + void GaugeField::PrintMatrix(int dim, int parity, unsigned int x_cb, int rank) const + { + genericPrintMatrix(*this, dim, parity, x_cb, rank); + } + } // namespace quda diff --git a/lib/gauge_laplace.cpp b/lib/gauge_laplace.cpp index 00843bb272..752670b676 100644 --- a/lib/gauge_laplace.cpp +++ b/lib/gauge_laplace.cpp @@ -29,7 +29,7 @@ namespace quda { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, comm_dim, profile); } void GaugeLaplace::DslashXpay(cvector_ref &out, cvector_ref &in, @@ -43,7 +43,7 @@ namespace quda { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, comm_dim, profile); } void GaugeLaplace::M(cvector_ref &out, cvector_ref &in) const @@ -79,7 +79,11 @@ namespace quda { // do nothing } - GaugeLaplacePC::GaugeLaplacePC(const DiracParam ¶m) : GaugeLaplace(param) { } + GaugeLaplacePC::GaugeLaplacePC(const DiracParam ¶m) : GaugeLaplace(param) + { + for (auto i = 0; i < 4; i++) + if (laplace3D == i) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + } GaugeLaplacePC::GaugeLaplacePC(const GaugeLaplacePC &dirac) : GaugeLaplace(dirac) { } diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index 0da4412d17..2cc755bc81 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -1,5 +1,6 @@ #include #include +#include namespace quda { @@ -61,7 +62,7 @@ namespace quda { norm_ = norm(u, d, type); // factor of two to account for spin with MG fields } else { if constexpr (sizeof...(N) > 0) { - norm_ = norm(u, d, type, IntList()); + norm_ = norm(u, d, type, IntList()); } else { errorQuda("Nc = %d has not been instantiated", u.Ncolor()); } @@ -108,4 +109,76 @@ namespace quda { return nrm; } + template void print_matrix(const Order &o, int d, int parity, unsigned int x_cb) + { + for (int r = 0; r < o.Ncolor(); r++) { + printf("rank %d parity %d x %u row %d", comm_rank(), parity, x_cb, r); + for (int c = 0; c < o.Ncolor(); c++) { + auto value = complex(o(d, parity, x_cb, r, c)); + printf(" (%g,%g)", value.real(), value.imag()); + } + printf("\n"); + } + } + + template + void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb) + { + switch (a.FieldOrder()) { + case QUDA_FLOAT2_GAUGE_ORDER: + print_matrix(FieldOrder(a), d, parity, x_cb); + break; + case QUDA_QDP_GAUGE_ORDER: + print_matrix(FieldOrder(a), d, parity, x_cb); + break; + case 
QUDA_MILC_GAUGE_ORDER: + print_matrix(FieldOrder(a), d, parity, x_cb); + break; + default: errorQuda("Unsupported field order %d", a.FieldOrder()); + } + } + + template + void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, IntList) + { + if (a.Ncolor() == nColor) { + genericPrintMatrix(a, d, parity, x_cb); + } else { + if constexpr (sizeof...(N) > 0) { + genericPrintMatrix(a, d, parity, x_cb, IntList()); + } else { + errorQuda("Not supported Ncolor = %d", a.Ncolor()); + } + } + } + + template void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb) + { + genericPrintMatrix(a, d, parity, x_cb, IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@>()); + } + + void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, int rank) + { + GaugeFieldParam param(a); + param.field = const_cast(&a); + param.location = QUDA_CPU_FIELD_LOCATION; + param.create = QUDA_COPY_FIELD_CREATE; + // if field is a pinned device field then we need to clone it on the host + bool host_clone + = (a.Location() == QUDA_CUDA_FIELD_LOCATION && a.MemType() == QUDA_MEMORY_DEVICE && !use_managed_memory()) ? true : + false; + std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); + const GaugeField &a_ = !host_clone ? a : *clone_a.get(); + + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + + switch (a.Precision()) { + case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + case QUDA_SINGLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + case QUDA_HALF_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + case QUDA_QUARTER_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + default: errorQuda("Precision %d not implemented", a.Precision()); + } + } + } // namespace quda diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index e5b678acfd..6fd6382488 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1434,7 +1434,7 @@ void endQuda(void) namespace quda { - void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { double kappa = inv_param->kappa; if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { @@ -1549,7 +1549,7 @@ namespace quda { diracParam.use_mobius_fused_kernel = inv_param->use_mobius_fused_kernel; } - void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1567,7 +1567,7 @@ namespace quda { inv_param->cuda_prec_sloppy); } - void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1586,7 +1586,7 @@ namespace quda { } // The preconditioner currently mimicks the sloppy operator with no comms - void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms) + void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc, bool comms) { setDiracParam(diracParam, inv_param, pc); @@ -1617,7 +1617,7 @@ namespace quda { inv_param->cuda_prec_precondition); } - void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracEigParam(DiracParam &diracParam, 
QudaInvertParam *inv_param, bool pc, bool use_smeared_gauge) { setDiracParam(diracParam, inv_param, pc); @@ -1625,6 +1625,20 @@ namespace quda { diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatExtended : gaugeExtended; diracParam.fatGauge = gaugeFatExtended; diracParam.longGauge = gaugeLongExtended; + } else if (use_smeared_gauge) { + if (!gaugeSmeared) errorQuda("No smeared gauge field present"); + if (inv_param->dslash_type == QUDA_LAPLACE_DSLASH) { + if (gaugeSmeared->GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED) { + GaugeFieldParam gauge_param(*gaugePrecise); + GaugeField gaugeEig(gauge_param); + copyExtendedGauge(gaugeEig, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); + gaugeEig.exchangeGhost(); + std::swap(gaugeEig, *gaugeSmeared); + } + diracParam.gauge = gaugeSmeared; + } else { + errorQuda("Smeared gauge field not supported for operator %d", inv_param->dslash_type); + } } else { diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatEigensolver : gaugeEigensolver; diracParam.fatGauge = gaugeFatEigensolver; @@ -1646,7 +1660,7 @@ namespace quda { inv_param->cuda_prec_eigensolver); } - void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, const bool pc_solve) + void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1663,8 +1677,7 @@ namespace quda { dPre = Dirac::create(diracPreParam); } - void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam ¶m, - const bool pc_solve) + void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam ¶m, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1684,8 +1697,8 @@ namespace quda { dRef = Dirac::create(diracRefParam); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, - const bool pc_solve) + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, bool pc_solve, + bool use_smeared_gauge) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1696,7 +1709,7 @@ namespace quda { setDiracSloppyParam(diracSloppyParam, ¶m, pc_solve); bool pre_comms_flag = (param.schwarz_type != QUDA_INVALID_SCHWARZ) ? false : true; setDiracPreParam(diracPreParam, ¶m, pc_solve, pre_comms_flag); - setDiracEigParam(diracEigParam, ¶m, pc_solve); + setDiracEigParam(diracEigParam, ¶m, pc_solve, use_smeared_gauge); d = Dirac::create(diracParam); // create the Dirac operator dSloppy = Dirac::create(diracSloppyParam); @@ -2392,14 +2405,17 @@ void checkClover(QudaInvertParam *param) { quda::GaugeField *checkGauge(QudaInvertParam *param) { - quda::GaugeField *cudaGauge = nullptr; - if (param->dslash_type != QUDA_ASQTAD_DSLASH) { - if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist"); + quda::GaugeField *U = param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise : + gaugePrecise; - if (param->cuda_prec != gaugePrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision()); - } + if (U == nullptr) + errorQuda("Precise gauge %sfield doesn't exist", param->dslash_type == QUDA_ASQTAD_DSLASH ? 
"fat " : ""); + + if (param->cuda_prec != U->Precision()) { + errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, U->Precision()); + } + if (param->dslash_type != QUDA_ASQTAD_DSLASH) { if (param->cuda_prec_sloppy != gaugeSloppy->Precision() || param->cuda_prec_precondition != gaugePrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision() @@ -2416,18 +2432,10 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist"); if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist"); if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist"); - if (param->overlap) { - if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); - } - cudaGauge = gaugePrecise; + if (param->overlap && gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); } else { - if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist"); if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist"); - if (param->cuda_prec != gaugeFatPrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision()); - } - if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision() || param->cuda_prec_precondition != gaugeFatPrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeFatRefinement->Precision() @@ -2450,23 +2458,18 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist"); if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist"); if (gaugeFatEigensolver == nullptr) errorQuda("Eigensolver gauge fat field doesn't exist"); - if (param->overlap) { - if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); - } + if (param->overlap && gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist"); if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist"); if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist"); if (gaugeLongEigensolver == nullptr) errorQuda("Eigensolver gauge long field doesn't exist"); - if (param->overlap) { - if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); - } - cudaGauge = gaugeFatPrecise; + if (param->overlap && gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); } checkClover(param); - return cudaGauge; + return U; } void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse) @@ -2582,7 +2585,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Create the dirac operator with a sloppy and a precon. 
bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); - createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve); + createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve, eig_param->use_smeared_gauge); Dirac &dirac = *dEig; //------------------------------------------------------ @@ -2679,7 +2682,8 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Transfer Eigenpairs back to host if using GPU eigensolver. The copy // will automatically rotate from device UKQCD gamma basis to the // host side gamma basis. - for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); } + memcpy(host_evals, evals.data(), sizeof(Complex) * evals.size()); + if (!(eig_param->arpack_check)) { for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i]; } @@ -4256,7 +4260,6 @@ void computeHISQForceQuda(void* const milc_momentum, using namespace quda; using namespace quda::fermion_force; - if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order); { // default settings for the unitarization @@ -4399,7 +4402,6 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeFieldParam param(*gParam); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_REFERENCE_FIELD_CREATE; - param.order = QUDA_MILC_GAUGE_ORDER; param.link_type = QUDA_ASQTAD_MOM_LINKS; param.reconstruct = QUDA_RECONSTRUCT_10; param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; @@ -4421,7 +4423,6 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeFieldParam wParam(gParam_field); wParam.location = QUDA_CPU_FIELD_LOCATION; wParam.create = QUDA_REFERENCE_FIELD_CREATE; - wParam.order = QUDA_MILC_GAUGE_ORDER; wParam.link_type = QUDA_GENERAL_LINKS; wParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; wParam.gauge = (void *)w_link; @@ -4980,7 +4981,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, for (unsigned int i = 0; i < n_steps; i++) { if (i) in = out; - ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, comm_dim, profileWuppertal); + ApplyLaplace(out, in, *precise, 3, a, b, in, parity, comm_dim, profileWuppertal); logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out)); } @@ -5126,7 +5127,9 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); - logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", 0, + obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); // set default dir_ignore = 3 for APE and STOUT for compatibility int dir_ignore = smear_param->dir_ignore; @@ -5145,13 +5148,15 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable case QUDA_GAUGE_SMEAR_HYP: HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore); break; - default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); + default: errorQuda("Unknown gauge smear type %d", smear_param->smear_type); } if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; gaugeObservablesQuda(&obs_param[measurement_n]); - 
logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", + i + 1, obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); } } @@ -5300,7 +5305,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] = Laplace [0] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, comm_dim, profileGFlow); // [0] = [4] = Laplace [0] = Laplace [3] f_temp0 = f_temp4; @@ -5318,7 +5323,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] <- Laplace [1] copyExtendedGauge(precise, gout, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, comm_dim, profileGFlow); // [1] <- [4] f_temp1 = f_temp4; @@ -5336,7 +5341,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] <- Laplace [2] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, comm_dim, profileGFlow); // [2] <- [4] = Laplace [2] f_temp2 = f_temp4; @@ -5590,7 +5595,7 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param) auto profile = pushProfile(profileGaugeObs); checkGaugeObservableParam(param); - if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field"); + if (!gaugePrecise) errorQuda("Cannot compute gauge observables as there is no resident gauge field"); GaugeField *gauge = nullptr; if (!gaugeSmeared) { diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 0e86f80a17..a4713a16b5 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -82,10 +82,10 @@ namespace quda int iter = 0; int step = 0; - bool converged = false; + bool is_done = false; PrintStats("MR", iter, r2, b2); - while (!converged) { + while (!is_done) { int k = 0; vector scale(b.size(), 1.0); @@ -142,22 +142,23 @@ namespace quda commGlobalReductionPop(); // renable global reductions for outer solver } + step++; + // FIXME - add over/under relaxation in outer loop bool compute_true_res = param.compute_true_res || param.Nsteps > 1; if (compute_true_res) { mat(r, x); r2 = blas::xmyNorm(b, r); for (auto i = 0u; i < b2.size(); i++) param.true_res[i] = sqrt(r2[i] / b2[i]); - converged = (step < param.Nsteps && r2 < stop) ? true : false; - if (!converged) blas::copy(r_sloppy, r); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r_sloppy, r); PrintStats("MR (restart)", iter, r2, b2); } else { blas::ax(scale, r_sloppy); r2 = blas::norm2(r_sloppy); - converged = (step < param.Nsteps && r2 < stop) ? 
true : false; - if (!converged) blas::copy(r, r_sloppy); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r, r_sloppy); } - step++; } PrintSummary("MR", iter, r2, b2, stopping(param.tol, b2, param.residual_type)); diff --git a/lib/laplace.cu b/lib/laplace.cu index aab6dd2d8e..5e5c3803eb 100644 --- a/lib/laplace.cu +++ b/lib/laplace.cu @@ -151,18 +151,18 @@ namespace quda LaplaceApply(cvector_ref &out, cvector_ref &in, cvector_ref &x, const GaugeField &U, int dir, double a, double b, int parity, - bool dagger, const int *comm_override, TimeProfile &profile) + const int *comm_override, TimeProfile &profile) { constexpr int nDim = 4; auto halo = ColorSpinorField::create_comms_batch(in); if (in.Nspin() == 1) { constexpr int nSpin = 1; - LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace laplace(arg, out, in, halo); dslash::DslashPolicyTune policy(laplace, in, halo, profile); } else if (in.Nspin() == 4) { constexpr int nSpin = 4; - LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace laplace(arg, out, in, halo); dslash::DslashPolicyTune policy(laplace, in, halo, profile); } else { @@ -175,11 +175,11 @@ namespace quda // out(x) = M*in = - a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu) + b*in(x) // Omits direction 'dir' from the operator. void ApplyLaplace(cvector_ref &out, cvector_ref &in, const GaugeField &U, - int dir, double a, double b, cvector_ref &x, int parity, bool dagger, + int dir, double a, double b, cvector_ref &x, int parity, const int *comm_override, TimeProfile &profile) { if constexpr (is_enabled()) { - instantiate(out, in, x, U, dir, a, b, parity, dagger, comm_override, profile); + instantiate(out, in, x, U, dir, a, b, parity, comm_override, profile); } else { errorQuda("Laplace operator has not been enabled"); } diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index c620aaaa57..dbc03f15cf 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -968,7 +968,7 @@ namespace quda } transfer->R(x_coarse[0], tmp2); - static_cast(diracCoarseResidual)->M(r_coarse, tmp_coarse); + static_cast(diracCoarseResidual)->M(r_coarse[0], tmp_coarse); #if 0 // enable to print out emulated and actual coarse-grid operator vectors for debugging setOutputPrefix(""); diff --git a/lib/multigrid.in.hpp b/lib/multigrid.in.hpp index 46da3568e0..55ac01280c 100644 --- a/lib/multigrid.in.hpp +++ b/lib/multigrid.in.hpp @@ -38,7 +38,7 @@ namespace quda { param.nVec = nVec; param.create = QUDA_NULL_FIELD_CREATE; param.fieldOrder = order; - return ColorSpinorField(param); + return getFieldTmp(param); } inline auto create_color_spinor_copy(const ColorSpinorField &f, QudaFieldOrder order) diff --git a/lib/prolongator.in.cpp b/lib/prolongator.in.cpp index e39fa7ee49..2c01ca2eb6 100644 --- a/lib/prolongator.in.cpp +++ b/lib/prolongator.in.cpp @@ -28,9 +28,9 @@ namespace quda if constexpr (use_mma) { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_in = create_color_spinor_copy(in, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); - ColorSpinorField V = create_color_spinor_copy(v, csOrder); + auto v_in = create_color_spinor_copy(in, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); + auto V = create_color_spinor_copy(v, csOrder); BlockTransposeForward(v_in, in); V.copy(v); diff 
--git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90 index 8eda362fbe..fc7666dd60 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -224,9 +224,6 @@ module quda_fortran real(8) :: temp real(8) :: clock - ! Enable auto-tuning? - QudaTune :: tune - ! Number of steps in s-step algorithms integer(4) :: nsteps diff --git a/lib/reduce_helper.cu b/lib/reduce_helper.cu index cf95032656..72d18ba433 100644 --- a/lib/reduce_helper.cu +++ b/lib/reduce_helper.cu @@ -44,7 +44,7 @@ namespace quda void apply(const qudaStream_t &stream) { // intentionally do not autotune, since this can be called inside a tuning region - auto tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity()); + auto tp = tuneLaunch(*this, false, getVerbosity()); launch_device(tp, stream, init_arg(reduce_count, n_reduce)); } }; diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp index 96fc6f33fa..ca59737cb8 100644 --- a/lib/restrictor.in.cpp +++ b/lib/restrictor.in.cpp @@ -26,9 +26,9 @@ namespace quda if constexpr (coarseColor >= fineColor) { if constexpr (use_mma) { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_in = create_color_spinor_copy(in, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); - ColorSpinorField V = create_color_spinor_copy(v, csOrder); + auto v_in = create_color_spinor_copy(in, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); + auto V = create_color_spinor_copy(v, csOrder); bool from_non_rel = (in.Nspin() == 4) && (in[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS); BlockTransposeForward(v_in, in, from_non_rel); diff --git a/lib/solve.cpp b/lib/solve.cpp index c9ba9baf21..79fa868633 100644 --- a/lib/solve.cpp +++ b/lib/solve.cpp @@ -317,8 +317,8 @@ namespace quda getProfile().TPSTOP(QUDA_PROFILE_EPILOGUE); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, - const bool pc_solve); + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, bool pc_solve, + bool use_smeared_gauge); extern std::vector solutionResident; @@ -349,7 +349,8 @@ namespace quda // Create the dirac operator and operators for sloppy, precondition, // and an eigensolver - createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve); + createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve, + param.eig_param ? static_cast(param.eig_param)->use_smeared_gauge : false); // wrap CPU host side pointers ColorSpinorParam cpuParam(hp_b[0], param, u.X(), pc_solution, param.input_location); diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index 719bfb27e6..234fbf35a3 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,8 +103,11 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?" 
ON + "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) +mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT) mark_as_advanced(QUDA_JITIFY) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM_TAR) diff --git a/lib/targets/hip/target_hip.cmake b/lib/targets/hip/target_hip.cmake index c8f8604040..5c69a52384 100644 --- a/lib/targets/hip/target_hip.cmake +++ b/lib/targets/hip/target_hip.cmake @@ -54,9 +54,15 @@ set(CMAKE_HIP_FLAGS_RELEASE set(CMAKE_HIP_FLAGS_HOSTDEBUG "-g" CACHE STRING "Flags used by the C++ compiler during host-debug builds.") -set(CMAKE_HIP_FLAGS_DEBUG - "-g -G" - CACHE STRING "Flags used by the C++ compiler during full (host+device) debug builds.") +if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + set(CMAKE_HIP_FLAGS_DEBUG + "-g -G" + CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for NVIDIA).") +else() + set(CMAKE_HIP_FLAGS_DEBUG + "-g" + CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for Clang).") +endif() set(CMAKE_HIP_FLAGS_SANITIZE "-g " CACHE STRING "Flags used by the C++ compiler during sanitizer debug builds.") diff --git a/lib/tune.cpp b/lib/tune.cpp index 3bcd7da964..2aefd3988f 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -152,9 +152,37 @@ namespace quda const map &getTuneCache() { return tunecache; } /** - * Deserialize tunecache from an istream, useful for reading a file or receiving from other nodes. + * @brief Distribute the tunecache from a given rank to all other nodes. + * @param[in] root_rank From which global rank to do the broadcast + * @param[out] tc Where we wish to receive the tunecache. This + * defaults to the local tunecache. + */ + static void broadcastTuneCache(int32_t root_rank = 0, map &tc_recv = tunecache); + + void joinTuneCache(const std::vector &global_tune_rank) + { + // vector of the split tunecaches + std::vector split_tc(global_tune_rank.size()); + // broadcast each tunecache to every process + for (auto i = 0u; i < global_tune_rank.size(); i++) { + broadcastTuneCache(global_tune_rank[i], split_tc[i]); + if (comm_rank() == global_tune_rank[i]) split_tc[i] = tunecache; + logQuda(QUDA_DEBUG_VERBOSE, "i = %d tune_rank = %d tc size = %lu\n", i, global_tune_rank[i], split_tc[i].size()); + } + + // now merge the maps + tunecache = split_tc[0]; + for (auto i = 1u; i < global_tune_rank.size(); i++) { tunecache.merge(split_tc[i]); } + } + + /** + * Deserialize tunecache from an istream, useful for reading a file + * or receiving from other nodes. + * @param[in] in The stream from which we are deserializing + * @param[out] tc The tunecache to which we are deserializing. This + * defaults to the local tunecache. */ - static void deserializeTuneCache(std::istream &in) + static void deserializeTuneCache(std::istream &in, map &tc = tunecache) { std::string line; std::stringstream ls; @@ -185,7 +213,7 @@ namespace quda ls.ignore(1); // throw away tab before comment getline(ls, param.comment); // assume anything remaining on the line is a comment param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this - tunecache[key] = param; + tc[key] = param; } } @@ -320,30 +348,26 @@ namespace quda } } - /** - * @brief Distribute the tunecache from a given rank to all other nodes. 
- * @param[in] root_rank From which global rank to do the broadcast - */ - static void broadcastTuneCache(int32_t root_rank = 0) + static void broadcastTuneCache(int32_t root_rank, map &tc_recv) { std::stringstream serialized; size_t size; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serializeTuneCache(serialized); size = serialized.str().length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast(serialized.str().c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast(serialized.str().c_str()), size, root_rank); } else { std::vector serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate serialized.str(serstr.data()); - deserializeTuneCache(serialized); + deserializeTuneCache(serialized, tc_recv); } } } @@ -353,7 +377,7 @@ namespace quda */ void loadTuneCache() { - if (getTuning() == QUDA_TUNE_NO) { + if (!getTuning()) { warningQuda("Autotuning disabled"); return; } @@ -826,20 +850,20 @@ namespace quda */ void broadcast(int32_t root_rank) { - size_t size; + size_t size = 0; std::string serialized; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serialized = serialize(); size = serialized.length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast(serialized.c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast(serialized.c_str()), size, root_rank); } else { std::vector serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate std::string_view deserialized(serstr.data()); deserialize(deserialized); @@ -859,7 +883,7 @@ namespace quda * Return the optimal launch parameters for a given kernel, either * by retrieving them from tunecache or autotuning on the spot. 
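 * @param[in] tunable The kernel instance for which we seek launch parameters
 * @param[in] enabled Whether autotuning is enabled; if not, the default
 * launch parameters are returned
 * @param[in] verbosity The verbosity to use while tuning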
*/ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity) + TuneParam tuneLaunch(Tunable &tunable, bool enabled, QudaVerbosity verbosity) { #ifdef LAUNCH_TIMER launchTimer.TPSTART(QUDA_PROFILE_TOTAL); @@ -880,7 +904,7 @@ namespace quda it = tunecache.find(key); // first check if we have the tuned value and return if we have it - if (enabled == QUDA_TUNE_YES && it != tunecache.end()) { + if (enabled && it != tunecache.end()) { #ifdef LAUNCH_TIMER launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE); @@ -926,7 +950,7 @@ namespace quda static TuneParam param; - if (enabled == QUDA_TUNE_NO) { + if (!enabled) { TuneParam param_default; param_default.aux = make_int4(-1, -1, -1, -1); tunable.defaultTuneParam(param_default); @@ -959,7 +983,7 @@ namespace quda - we are tuning an uber kernel in which case do the tuning on all ranks since we can't guarantee that all nodes are partaking */ - if (comm_rank_global() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { + if (comm_rank() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { TuneParam best_param; TuneCandidates tc(tunable.num_candidates()); float best_time; diff --git a/lib/util_quda.cpp b/lib/util_quda.cpp index e279c0f43e..b033efb20c 100644 --- a/lib/util_quda.cpp +++ b/lib/util_quda.cpp @@ -51,17 +51,18 @@ bool getRankVerbosity() { return rank_verbosity; } +static bool tune = true; + // default has autotuning enabled but can be overridden with the QUDA_ENABLE_TUNING environment variable -QudaTune getTuning() { +bool getTuning() +{ static bool init = false; - static QudaTune tune = QUDA_TUNE_YES; - if (!init) { char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); - if (!enable_tuning || strcmp(enable_tuning,"0")!=0) { - tune = QUDA_TUNE_YES; + if (!enable_tuning || strcmp(enable_tuning, "0") != 0) { + tune = true; } else { - tune = QUDA_TUNE_NO; + tune = false; } init = true; } @@ -69,6 +70,34 @@ QudaTune getTuning() { return tune; } +void setTuning(bool tuning) +{ + // first check if tuning is disabled, in which case we do nothing + static bool init = false; + static bool tune_disable = false; + if (!init) { + char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); + tune_disable = (enable_tuning && strcmp(enable_tuning, "0") == 0); + init = true; + } + if (!tune_disable) tune = tuning; +} + +static std::stack tstack; + +void pushTuning(bool tuning) +{ + tstack.push(getTuning()); + setTuning(tuning); +} + +void popTuning() +{ + if (tstack.empty()) errorQuda("popTuning() called with empty stack"); + setTuning(tstack.top()); + tstack.pop(); +} + void setOutputPrefix(const char *prefix) { strncpy(prefix_, prefix, MAX_PREFIX_SIZE); @@ -80,7 +109,6 @@ void setOutputFile(FILE *outfile) outfile_ = outfile; } - static std::stack vstack; void pushVerbosity(QudaVerbosity verbosity) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b295e00d9c..da3e7a97bf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,7 +110,7 @@ if(QUDA_DIRAC_WILSON install(TARGETS deflated_invert_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() -if(QUDA_DIRAC_STAGGERED) +if(QUDA_DIRAC_STAGGERED OR QUDA_DIRAC_LAPLACE) add_executable(staggered_dslash_test staggered_dslash_test.cpp) target_link_libraries(staggered_dslash_test ${TEST_LIBS}) quda_checkbuildtest(staggered_dslash_test QUDA_BUILD_ALL_TESTS) @@ -291,6 +291,8 @@ if(QUDA_MPI OR QUDA_QMP) message(STATUS "ctest will run on ${QUDA_TEST_NUM_PROCS} processes") set(QUDA_CTEST_LAUNCH 
${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS} CACHE STRING "CTest Launcher command for QUDA's tests") +else() + set(QUDA_TEST_NUM_PROCS 1) endif() # BLAS tests @@ -963,7 +965,7 @@ endif() set_tests_properties(dslash_${DIRAC_NAME}_build_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2}) endif() - if(QUDA_LAPLACE) + if(QUDA_DIRAC_LAPLACE) set(DIRAC_NAME laplace) add_test(NAME dslash_${DIRAC_NAME}_mat_policy${pol2} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} @@ -1009,19 +1011,15 @@ if(QUDA_DIRAC_WILSON) --enable-testing true --gtest_output=xml:invert_test_wilson.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson.xml) + add_test(NAME invert_test_splitgrid_wilson + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --ngcrkrylov 8 + --dim 2 4 6 8 --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_wilson.xml) - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) endif() if(QUDA_DIRAC_TWISTED_MASS) @@ -1186,19 +1184,15 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:invert_test_staggered_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_staggered_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) add_test(NAME invert_test_asqtad_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} @@ -1207,21 +1201,15 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --nsrc 4 --nsrc-tile 4 --gtest_output=xml:invert_test_asqtad_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_asqtad_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true - --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - 
--gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_asqtad_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true + --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) - if (QUDA_LAPLACE) + if (QUDA_DIRAC_LAPLACE) add_test(NAME invert_test_laplace_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type laplace --ngcrkrylov 8 --compute-fat-long true @@ -1243,21 +1231,6 @@ if (QUDA_DIRAC_DISTANCE_PRECONDITIONING) --distance-pc-alpha0 0.1 --distance-pc-t0 4 --enable-testing true --gtest_output=xml:invert_test_wilson_distance_pc.xml) - - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --distance-pc-alpha0 0.1 --distance-pc-t0 4 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson_distance_pc.xml) - - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() endif() if(QUDA_DIRAC_CLOVER) @@ -1469,7 +1442,7 @@ foreach(prec IN LISTS TEST_PRECS) --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) endif() - if (QUDA_LAPLACE) + if (QUDA_DIRAC_LAPLACE) add_test(NAME eigensolve_test_laplace_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type laplace --compute-fat-long true diff --git a/tests/clover_force_test.cpp b/tests/clover_force_test.cpp index 4a9edb57b2..5847bec40a 100644 --- a/tests/clover_force_test.cpp +++ b/tests/clover_force_test.cpp @@ -127,7 +127,7 @@ std::tuple clover_force_test(test_t param) gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER; gauge_param.overwrite_mom = 1; - if (getTuning() == QUDA_TUNE_YES) + if (getTuning()) computeTMCloverForceQuda(mom.data(), in.data(), in0.data(), coeff.data(), nvector, &gauge_param, &inv_param, detratio); diff --git a/tests/contract_ft_test.cpp b/tests/contract_ft_test.cpp index cc793ede5b..740adf6cc9 100644 --- a/tests/contract_ft_test.cpp +++ b/tests/contract_ft_test.cpp @@ -171,9 +171,9 @@ inline int launch_contract_test(const QudaContractType cType, const std::array(buffs, X, dof); - for (int s = 0; s < nprops; ++s, off += spinor_field_floats * sizeof(Float)) { - spinorX[s] = (void *)((uintptr_t)buffs[0].data() + off); - spinorY[s] = (void *)((uintptr_t)buffs[1].data() + off); + for (int s = 0; s < nprops; ++s, off += spinor_field_floats) { + spinorX[s] = static_cast(buffs[0].data() + off); + spinorY[s] = static_cast(buffs[1].data() + off); } // Perform GPU contraction: void *d_result_ = static_cast(d_result.data()); @@ -181,9 +181,8 @@ inline int launch_contract_test(const QudaContractType cType, const std::array((Float **)spinorX.data(), (Float **)spinorY.data(), d_result.data(), cType, - src_colors, X.data(), source_position.data(), n_mom, mom.data(), fft_type.data()); + int faults = contractionFT_reference(spinorX.data(), spinorY.data(), d_result.data(), cType, src_colors, + X.data(), source_position.data(), n_mom, mom.data(), fft_type.data()); return 
faults; } diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index 3d77668d97..5512c3d0a2 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -428,9 +428,8 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: // My dslash should be the same as the clover dslash - for (int i = 0; i < Nsrc; i++) - clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, - inv_param.cpu_prec, gauge_param); + clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, + inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: // my matpc op @@ -1062,12 +1061,12 @@ struct DslashTestWrapper { 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min, 2 * ghost_bytes); ::testing::Test::RecordProperty("Gflops", std::to_string(1.0e-9 * flops / dslash_time.event_time)); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); } } diff --git a/tests/eigensolve_test.cpp b/tests/eigensolve_test.cpp index e325511e05..a9561ca2a5 100644 --- a/tests/eigensolve_test.cpp +++ b/tests/eigensolve_test.cpp @@ -191,17 +191,14 @@ std::vector eigensolve(test_t test_param) // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector evecs(n_eig); quda::ColorSpinorParam cs_param; constructWilsonTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. 
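// Editor's note: each pointer populated below aliases the corresponding
// field's host allocation (no copy is made); QUDA reads and writes the
// eigenvectors in place through this void** interface.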
std::vector host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues std::vector<__complex__ double> evals(eig_n_conv); diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp index c5197af83c..2994904e29 100644 --- a/tests/gauge_alg_test.cpp +++ b/tests/gauge_alg_test.cpp @@ -79,7 +79,7 @@ struct GaugeAlgTest : public ::testing::TestWithParam { #ifndef QUDA_BUILD_NATIVE_FFT // skip FFT tests if FFT not available const ::testing::TestInfo *const test_info = ::testing::UnitTest::GetInstance()->current_test_info(); const char *name = test_info->name(); - if (strcmp(name, "Landau_FFT") == 0 || strcmp(name, "Coulomb_FFT") == 0) { + if (strncmp(name, "Landau_FFT", 10) == 0 || strncmp(name, "Coulomb_FFT", 11) == 0) { execute = false; GTEST_SKIP(); } diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp index 1efa49ef97..c7820d8575 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -179,7 +179,7 @@ void gauge_force_test(force_test_t test_param) errorQuda("Unsupported gauge order %d", gauge_order); } - if (getTuning() == QUDA_TUNE_YES) { + if (getTuning()) { if (compute_force) computeGaugeForceQuda(mom, sitelink, input_path_buf, length, loop_coeff_d, num_paths, max_length, eb3, &gauge_param); @@ -324,7 +324,7 @@ void gauge_loop_test(loop_test_t loop_param) // compute the plaquette as part of validation obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - if (getTuning() == QUDA_TUNE_YES) { gaugeObservablesQuda(&obsParam); } + if (getTuning()) { gaugeObservablesQuda(&obsParam); } quda::host_timer_t host_timer; // Multiple execution to exclude warmup time in the first run diff --git a/tests/host_reference/contract_ft_reference.h b/tests/host_reference/contract_ft_reference.h index 697ed550e2..e811d4ec99 100644 --- a/tests/host_reference/contract_ft_reference.h +++ b/tests/host_reference/contract_ft_reference.h @@ -58,7 +58,7 @@ template inline void FourierPhase(Float z[2], const Float theta }; template -void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2, double *h_result, +void contractFTHost(void **h_prop_array_flavor_1, void **h_prop_array_flavor_2, double *h_result, const QudaContractType cType, const int src_colors, const int *X, const int *const source_position, const int n_mom, const int *const mom_modes, const QudaFFTSymmType *const fft_type) { @@ -126,8 +126,8 @@ void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2 for (int c1 = 0; c1 < src_colors; c1++) { // color contraction size_t off = nSpin * 3 * 2 * (Vh * parity + cb_idx); - contractColors(h_prop_array_flavor_1[s1 * src_colors + c1] + off, - h_prop_array_flavor_2[s2 * src_colors + c1] + off, nSpin, M.data()); + contractColors(static_cast(h_prop_array_flavor_1[s1 * src_colors + c1]) + off, + static_cast(h_prop_array_flavor_2[s2 * src_colors + c1]) + off, nSpin, M.data()); // apply gamma matrices here @@ -158,7 +158,7 @@ void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2 }; template -int contractionFT_reference(Float **spinorX, Float **spinorY, const double *const d_result, const QudaContractType cType, +int contractionFT_reference(void **spinorX, void **spinorY, const double *const d_result, const QudaContractType cType, const int src_colors, const 
int *X, const int *const source_position, const int n_mom, const int *const mom_modes, const QudaFFTSymmType *const fft_type) { diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 5bada81761..9b6ef102c8 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -746,17 +746,17 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { std::vector out_vector(1); out_vector[0] = out; - return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, src_idx); + return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, laplace3D, src_idx); } std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; double l2r_max = 0.0; @@ -783,7 +783,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, for (int i = 0; i < multishift; i++) { auto &out = out_vector[i]; double mass = 0.5 * sqrt(inv_param.offset[i]); - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); @@ -809,10 +809,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, auto &out = out_vector[0]; double mass = inv_param.mass; if (inv_param.solution_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); - - // correct for the massRescale function inside invertQuda - if (is_laplace(dslash_type)) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); + stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { @@ -820,9 +817,9 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else { errorQuda("Invalid staggered solution type %d", inv_param.solution_type); } @@ -844,8 +841,9 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, return {l2r_max, hqr_max}; } -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector 
&lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D) { QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; @@ -861,10 +859,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co else sol_type = QUDA_MATDAG_MAT_SOLUTION; } else { - if (use_pc) - sol_type = QUDA_MATPC_SOLUTION; - else - sol_type = QUDA_MAT_SOLUTION; + sol_type = use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; } // Create temporary spinors @@ -872,7 +867,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co quda::ColorSpinorField ref(csParam); if (sol_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } else if (sol_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { @@ -880,26 +875,60 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type, laplace3D); } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } - // Compute M * x - \lambda * x - caxpy(-lambda, spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double l2r = sqrt(nrm2 / src2); + if (laplace3D == 3) { + auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size; + auto t_offset = spinor.X()[3] * comm_coord(3); + std::vector<double> nrm2(spinor.X()[3] * comm_dim(3), 0.0); + std::vector<double> src2(spinor.X()[3] * comm_dim(3), 0.0); + // Compute M * x - \lambda * x on each slice + for (auto t = 0; t < spinor.X()[3]; t++) { + auto t_global = t_offset + t; + auto offset = t * batch_size * inv_param.cpu_prec; - printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + for (int parity = 0; parity < spinor.SiteSubset(); parity++) { + caxpy(-lambda[t_global], static_cast<char *>(spinor.data()) + offset, static_cast<char *>(ref.data()) + offset, + batch_size, inv_param.cpu_prec); - return l2r; + nrm2[t_global] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false); + src2[t_global] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false); + + offset += spinor.VolumeCB() * stag_spinor_site_size * inv_param.cpu_prec; + } + } + + comm_allreduce_sum(nrm2); + comm_allreduce_sum(src2); + + for (auto t = 0; t < spinor.X()[3] * comm_dim(3); t++) { + auto l = ((double *)&(lambda[t]))[0]; + printfQuda("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t, l, eig_param.tol, + sqrt(nrm2[t] / src2[t])); + } + auto total_nrm2 = std::accumulate(nrm2.begin(), nrm2.end(), 0.0); + auto total_src2 = std::accumulate(src2.begin(), src2.end(), 0.0); + + return sqrt(total_nrm2 / total_src2); + } else {
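For reviewers: the `laplace3D == 3` branch above computes one Rayleigh-quotient residual per timeslice and then aggregates them into the single figure the caller expects; the plain 4-d path follows below. A minimal standalone sketch of the same arithmetic on a hypothetical flat, slice-major array (plain C++, not the QUDA API):

```cpp
#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>

// Per-slice residuals ||M x - lambda_t x|| / ||x|| for Lt timeslices, then the
// aggregate residual over all slices (mirrors the accumulate/return above).
// Layout assumption (hypothetical): slice t occupies [t * slice_len, (t + 1) * slice_len).
double slice_residuals(const std::vector<std::complex<double>> &Mx, const std::vector<std::complex<double>> &x,
                       const std::vector<std::complex<double>> &lambda, int Lt)
{
  const std::size_t slice_len = x.size() / Lt;
  double total_nrm2 = 0.0, total_src2 = 0.0;
  for (int t = 0; t < Lt; t++) {
    double nrm2 = 0.0, src2 = 0.0;
    for (std::size_t k = t * slice_len; k < (t + 1) * slice_len; k++) {
      std::complex<double> r = Mx[k] - lambda[t] * x[k]; // M x - lambda_t x on this slice
      nrm2 += std::norm(r);
      src2 += std::norm(x[k]);
    }
    std::printf("t = %d host residual = %.15e\n", t, std::sqrt(nrm2 / src2));
    total_nrm2 += nrm2;
    total_src2 += src2;
  }
  return std::sqrt(total_nrm2 / total_src2);
}
```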
// Compute M * x - \lambda * x + caxpy(-lambda[0], spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double l2r = sqrt(nrm2 / src2); + printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + return l2r; + } } double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link) + const std::vector &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D) { + if (laplace3D == 3) errorQuda("3-d Laplace operator not supported"); QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); @@ -912,10 +941,10 @@ double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, qu quda::ColorSpinorField ref(csParam); // Only `mat` is used here - stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type, laplace3D); // Compute M * x_left - \sigma * x_right - caxpy(-sigma, spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + caxpy(-sigma[0], spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double src2 = norm_2(spinor_left.data(), spinor_left.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double l2r = sqrt(nrm2 / src2); diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 3c9543107e..307bb1db70 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -217,11 +217,13 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx); + QudaInvertParam &inv_param, int laplace3D, int src_idx); /** * @brief Verify a single- or multi-shift staggered inversion on the host * @param in The host source * @param out_vector The host solution(s) * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when
doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx = 0); + QudaInvertParam &inv_param, int laplace3D, int src_idx = 0); /** * @brief Verify a staggered-type eigenvector * * @param spinor The host eigenvector to be verified - * @param lambda The host eigenvalue to be verified + * @param lambda The host eigenvalue(s) to be verified * @param i The number of the eigenvalue, only used when printing outputs * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector &lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D); /** * @brief Verify a staggered-type singular vector @@ -262,11 +268,12 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link); + const std::vector &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D); /** * @brief Verify the spinor distance reweighting diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index ce285a6b99..741efdf0b7 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -426,7 +426,7 @@ static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_ /* This function only computes one direction @dir * */ -void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *sitelink, void *const *sitelink_ex, +void gauge_force_reference_dir(void *refMom, int dir, double eb3, quda::GaugeField &u, quda::GaugeField &u_ex, QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths, const lattice_t &lat, bool compute_force) { @@ -437,26 +437,30 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s for (int i = 0; i < num_paths; i++) { if (prec == QUDA_DOUBLE_PRECISION) { double *my_loop_coeff = (double *)loop_coeff; - compute_path_product((dsu3_matrix *)staple, (dsu3_matrix **)sitelink_ex, path_dir[i], length[i], my_loop_coeff[i], - dir, lat); + compute_path_product((dsu3_matrix *)staple, u_ex.data_array().data, path_dir[i], length[i], + my_loop_coeff[i],
dir, lat); } else { float *my_loop_coeff = (float *)loop_coeff; - compute_path_product((fsu3_matrix *)staple, (fsu3_matrix **)sitelink_ex, path_dir[i], length[i], my_loop_coeff[i], - dir, lat); + compute_path_product((fsu3_matrix *)staple, u_ex.data_array().data, path_dir[i], length[i], + my_loop_coeff[i], dir, lat); } } if (compute_force) { if (prec == QUDA_DOUBLE_PRECISION) { - update_mom((danti_hermitmat *)refMom, dir, (dsu3_matrix **)sitelink, (dsu3_matrix *)staple, (double)eb3, lat); + update_mom((danti_hermitmat *)refMom, dir, u.data_array().data, (dsu3_matrix *)staple, (double)eb3, + lat); } else { - update_mom((fanti_hermitmat *)refMom, dir, (fsu3_matrix **)sitelink, (fsu3_matrix *)staple, (float)eb3, lat); + update_mom((fanti_hermitmat *)refMom, dir, u.data_array().data, (fsu3_matrix *)staple, (float)eb3, + lat); } } else { if (prec == QUDA_DOUBLE_PRECISION) { - update_gauge((dsu3_matrix *)refMom, dir, (dsu3_matrix **)sitelink, (dsu3_matrix *)staple, (double)eb3, lat); + update_gauge((dsu3_matrix *)refMom, dir, u.data_array().data, (dsu3_matrix *)staple, (double)eb3, + lat); } else { - update_gauge((fsu3_matrix *)refMom, dir, (fsu3_matrix **)sitelink, (fsu3_matrix *)staple, (float)eb3, lat); + update_gauge((fsu3_matrix *)refMom, dir, u.data_array().data, (fsu3_matrix *)staple, (float)eb3, + lat); } } host_free(staple); @@ -465,8 +469,6 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir, int *length, void *loop_coeff, int num_paths, bool compute_force) { - void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)}; - // created extended field quda::lat_dim_t R; for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d); @@ -475,13 +477,12 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ** param.gauge_order = QUDA_QDP_GAUGE_ORDER; param.t_boundary = QUDA_PERIODIC_T; - auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R); + auto qdp_ex = quda::createExtendedGauge(u.data_array().data, param, R); lattice_t lat(*qdp_ex); - void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)}; for (int dir = 0; dir < 4; dir++) { - gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, u.Precision(), path_dir[dir], length, loop_coeff, - num_paths, lat, compute_force); + gauge_force_reference_dir(refMom, dir, eb3, u, *qdp_ex, u.Precision(), path_dir[dir], length, loop_coeff, num_paths, + lat, compute_force); } delete qdp_ex; @@ -490,8 +491,6 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ** void gauge_loop_trace_reference(quda::GaugeField &u, std::vector &loop_traces, double factor, int **input_path, int *length, double *path_coeff, int num_paths) { - void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)}; - // create extended field quda::lat_dim_t R; for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d); @@ -499,20 +498,20 @@ void gauge_loop_trace_reference(quda::GaugeField &u, std::vector setGaugeParam(param); param.gauge_order = QUDA_QDP_GAUGE_ORDER; param.t_boundary = QUDA_PERIODIC_T; - - auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R); + auto qdp_ex = quda::createExtendedGauge(u.data_array().data, param, R); lattice_t lat(*qdp_ex); - void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)}; std::vector loop_tr_dbl(2 * num_paths); for (int i = 0; i < 
num_paths; i++) { if (u.Precision() == QUDA_DOUBLE_PRECISION) { - dcomplex tr = compute_loop_trace((dsu3_matrix **)sitelink_ex, input_path[i], length[i], path_coeff[i], lat); + dcomplex tr + = compute_loop_trace(qdp_ex->data_array().data, input_path[i], length[i], path_coeff[i], lat); loop_tr_dbl[2 * i] = factor * tr.real; loop_tr_dbl[2 * i + 1] = factor * tr.imag; } else { - dcomplex tr = compute_loop_trace((fsu3_matrix **)sitelink_ex, input_path[i], length[i], path_coeff[i], lat); + dcomplex tr + = compute_loop_trace(qdp_ex->data_array().data, input_path[i], length[i], path_coeff[i], lat); loop_tr_dbl[2 * i] = factor * tr.real; loop_tr_dbl[2 * i + 1] = factor * tr.imag; } diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 1bef7b74bb..5da6dc1a7a 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -117,9 +117,9 @@ void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, siz void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, QudaPrecision precision, size_t nhops) { if (precision == QUDA_SINGLE_PRECISION) { - computeLinkOrderedOuterProduct((fsu3_vector *)src, dst, nhops); + computeLinkOrderedOuterProduct(static_cast(src), dst, nhops); } else { - computeLinkOrderedOuterProduct((dsu3_vector *)src, dst, nhops); + computeLinkOrderedOuterProduct(static_cast(src), dst, nhops); } } @@ -342,7 +342,7 @@ template class LoadStore void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index, Matrix<3, std::complex> *const mat) const; - void loadMatrixFromField(const Real *const field, int oddBit, int dir, int half_lattice_index, + void loadMatrixFromField(const Real *const *const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex> *const mat) const; void storeMatrixToField(const Matrix<3, std::complex> &mat, int oddBit, int half_lattice_index, @@ -352,12 +352,12 @@ template class LoadStore Real *const) const; void addMatrixToField(const Matrix<3, std::complex> &mat, int oddBit, int dir, int half_lattice_index, - Real coeff, Real *const) const; + Real coeff, Real *const *const) const; void storeMatrixToMomentumField(const Matrix<3, std::complex> &mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real *const) const; - Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const; - void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const; + Real getData(const Real *const *const field, int idx, int dir, int oddBit, int offset, int hfv) const; + void addData(Real *const *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const; int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const; int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const; }; @@ -423,16 +423,16 @@ int LoadStore::half_idx_conversion_normal2ex(int half_lattice_index, const } template -Real LoadStore::getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const +Real LoadStore::getData(const Real *const *const field, int idx, int dir, int oddBit, int offset, int hfv) const { // QDP format - return ((Real **)field)[dir][(hfv * oddBit + idx) * 18 + offset]; + return field[dir][(hfv * oddBit + idx) * 18 + offset]; } template -void LoadStore::addData(Real *const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const +void 
LoadStore::addData(Real *const *const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const { // QDP format - ((Real **)field)[dir][(hfv * oddBit + idx) * 18 + offset] += v; + field[dir][(hfv * oddBit + idx) * 18 + offset] += v; } template @@ -455,7 +455,7 @@ void LoadStore::loadMatrixFromField(const Real *const field, int oddBit, i } template -void LoadStore::loadMatrixFromField(const Real *const field, int oddBit, int dir, int half_lattice_index, +void LoadStore::loadMatrixFromField(const Real *const *const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex> *const mat) const { #ifdef MULTI_GPU @@ -464,11 +464,10 @@ void LoadStore::loadMatrixFromField(const Real *const field, int oddBit, i int hfv = Vh; #endif - // const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18; int offset = 0; for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) { - (*mat)(i, j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv)); + (*mat)(i, j) = getData(field, half_lattice_index, dir, oddBit, offset++, hfv); (*mat)(i, j) += std::complex(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv)); } } @@ -515,23 +514,18 @@ void LoadStore::addMatrixToField(const Matrix<3, std::complex> &mat, template void LoadStore::addMatrixToField(const Matrix<3, std::complex> &mat, int oddBit, int dir, - int half_lattice_index, Real coeff, Real *const field) const + int half_lattice_index, Real coeff, Real *const *const field) const { - #ifdef MULTI_GPU int hfv = Vh_ex; #else int hfv = Vh; #endif - // Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18; int offset = 0; for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) { - // local_field[offset++] += coeff*mat(i,j).real(); addData(field, half_lattice_index, dir, oddBit, offset++, coeff * mat(i, j).real(), hfv); - - // local_field[offset++] += coeff*mat(i,j).imag(); addData(field, half_lattice_index, dir, oddBit, offset++, coeff * mat(i, j).imag(), hfv); } } @@ -762,7 +756,8 @@ void computeOneLinkSite( #else const int[], #endif - int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore &ls, Real *const output) + int half_lattice_index, const Real *const *const oprod, int sig, Real coeff, const LoadStore &ls, + Real *const *const output) { if (GOES_FORWARDS(sig)) { typename ColorMatrix::Type colorMatW; @@ -777,7 +772,7 @@ void computeOneLinkSite( } template -void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output) +void computeOneLinkField(const int dim[4], const Real *const *const oprod, int sig, Real coeff, Real *const *const output) { int volume = 1; for (int dir = 0; dir < 4; ++dir) volume *= dim[dir]; @@ -795,10 +790,10 @@ void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Rea // middleLinkKernel compiles for now, but lots of debugging to be done template void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code. 
- const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, + const int dim[4], void *const oprod, const Real *const Qprev, const Real *const *const link, int sig, int mu, Real coeff, const LoadStore &ls, // pass a function object to read from and write to matrix fields - Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod) + Real *const Pmu, Real *const P3, Real *const Qmu, Real *const *const newOprod) { const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false; const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false; @@ -842,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett if (Qprev == NULL) { if (sig_positive) { - ls.loadMatrixFromField(oprod, 1 - oddBit, sig, point_d, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); } else { - ls.loadMatrixFromField(oprod, oddBit, OPP_DIR(sig), point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); colorMatY = conj(colorMatY); } } else { // Qprev != NULL - ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); } colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY; @@ -880,9 +875,9 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett } // computeMiddleLinkSite template -void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, +void computeMiddleLinkField(const int dim[4], void *const oprod, const Real *const Qprev, const Real *const *const link, int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu, - Real *const newOprod) + Real *const *const newOprod) { int volume = 1; @@ -911,9 +906,9 @@ template void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code. const int dim[4], const Real *const P3, const Real *const Qprod, // why? - const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore &ls, // pass a function object to read from and write to matrix fields - Real *const shortP, Real *const newOprod) + Real *const shortP, Real *const *const newOprod) { const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false; @@ -979,8 +974,8 @@ void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better template void computeSideLinkField(const int dim[4], const Real *const P3, const Real *const Qprod, // why? - const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, - Real *const newOprod) + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, + Real *const shortP, Real *const *const newOprod) { // Need some way of setting half_volume int volume = 1; @@ -1005,10 +1000,10 @@ void computeSideLinkField(const int dim[4], const Real *const P3, template void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code. 
- const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, - int sig, int mu, Real coeff, Real accumu_coeff, + const int dim[4], const Real *const oprod, const Real *const Qprev, + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore &ls, // pass a function object to read from and write to matrix fields - Real *const shortP, Real *const newOprod) + Real *const shortP, Real *const *const newOprod) { const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false; @@ -1092,8 +1087,9 @@ void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better } // allLinkKernel template -void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, - int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod) +void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, + Real *const shortP, Real *const *const newOprod) { int volume = 1; for (int dir = 0; dir < 4; ++dir) volume *= dim[dir]; @@ -1132,8 +1128,8 @@ template struct PathCoefficients { }; template -void doHisqStaplesForceCPU(const int dim[4], PathCoefficients staple_coeff, Real *oprod, Real *link, - Real **tempmat, Real *newOprod) +void doHisqStaplesForceCPU(const int dim[4], PathCoefficients staple_coeff, Real **oprod, Real **link, + Real **tempmat, Real **newOprod) { Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff; @@ -1217,9 +1213,6 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda uint64_t len = 1; for (int dir = 0; dir < 4; ++dir) len *= X_[dir]; #endif - // allocate memory for temporary fields - void *tempmat[6]; - for (int i = 0; i < 6; i++) { tempmat[i] = safe_malloc(len * 18 * precision); } PathCoefficients act_path_coeff; act_path_coeff.one = path_coeff[0]; @@ -1229,27 +1222,29 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda act_path_coeff.seven = path_coeff[4]; act_path_coeff.lepage = path_coeff[5]; - void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; - void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; - void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)}; - if (precision == QUDA_DOUBLE_PRECISION) { - doHisqStaplesForceCPU(X_, act_path_coeff, reinterpret_cast(oprod_array), - reinterpret_cast(link_array), (double **)tempmat, - reinterpret_cast(noprod_array)); - } else if (precision == QUDA_SINGLE_PRECISION) { - doHisqStaplesForceCPU(X_, act_path_coeff, reinterpret_cast(oprod_array), - reinterpret_cast(link_array), (float **)tempmat, - reinterpret_cast(noprod_array)); + if (precision == QUDA_SINGLE_PRECISION) { + // allocate memory for temporary fields + float *tempmat[6]; + for (int i = 0; i < 6; i++) { tempmat[i] = static_cast(safe_malloc(len * 18 * precision)); } + doHisqStaplesForceCPU(X_, act_path_coeff, oprod.data_array().data, link.data_array().data, + tempmat, newOprod->data_array().data); + for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); } + } else if (precision == QUDA_DOUBLE_PRECISION) { + // allocate memory for temporary fields + double *tempmat[6]; + for (int i = 0; i < 6; i++) { tempmat[i] = static_cast(safe_malloc(len * 18 * precision)); } + doHisqStaplesForceCPU(X_, act_path_coeff, oprod.data_array().data, + link.data_array().data, tempmat, 
newOprod->data_array().data); + for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); } } else { errorQuda("Unsupported precision"); } - - for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); } } template -void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, - int sig, Real coeff, const LoadStore &ls, Real *const output) +void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const *const oprod, + const Real *const *const link, int sig, Real coeff, const LoadStore &ls, + Real *const *const output) { if (GOES_FORWARDS(sig)) { @@ -1296,8 +1291,8 @@ void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *c } template -void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, - Real *const output) +void computeLongLinkField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig, + Real coeff, Real *const *const output) { int volume = 1; for (int dir = 0; dir < 4; ++dir) volume *= dim[dir]; @@ -1321,16 +1316,13 @@ void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeFiel for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; QudaPrecision precision = oprod.Precision(); - void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; - void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; - void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)}; for (int sig = 0; sig < 4; ++sig) { if (precision == QUDA_SINGLE_PRECISION) { - computeLongLinkField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), - sig, coeff, reinterpret_cast(noprod_array)); + computeLongLinkField(X_, oprod.data_array().data, link.data_array().data, sig, coeff, + newOprod->data_array().data); } else if (precision == QUDA_DOUBLE_PRECISION) { - computeLongLinkField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), - sig, coeff, reinterpret_cast(noprod_array)); + computeLongLinkField(X_, oprod.data_array().data, link.data_array().data, sig, coeff, + newOprod->data_array().data); } else { errorQuda("Unrecognised precision"); } @@ -1344,8 +1336,8 @@ void completeForceSite(int half_lattice_index, #else const int[], #endif - const Real *const oprod, const Real *const link, int sig, const LoadStore &ls, - Real *const mom) + const Real *const *const oprod, const Real *const *const link, int sig, + const LoadStore &ls, Real *const mom) { typename ColorMatrix::Type colorMatX, colorMatY, linkW; @@ -1366,7 +1358,8 @@ void completeForceSite(int half_lattice_index, } template -void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom) +void completeForceField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig, + Real *const mom) { int volume = dim[0] * dim[1] * dim[2] * dim[3]; const int half_volume = volume / 2; @@ -1384,15 +1377,13 @@ void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda: for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; QudaPrecision precision = oprod.Precision(); - void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; - void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; for (int sig = 0; sig < 4; ++sig) { if (precision == QUDA_SINGLE_PRECISION) { - completeForceField(X_, 
reinterpret_cast(oprod_array), reinterpret_cast(link_array), sig, + completeForceField(X_, oprod.data_array().data, link.data_array().data, sig, + mom->data()); } else if (precision == QUDA_DOUBLE_PRECISION) { - completeForceField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), - sig, mom->data()); + completeForceField(X_, oprod.data_array().data, link.data_array().data, sig, + mom->data()); } else { errorQuda("Unrecognised precision"); } diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 212549dc38..eb7eb3d604 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -29,13 +29,19 @@ * @param[in] oddBit The odd/even bit for the site index * @param[in] daggerBit Perform the ordinary dslash (0) or Hermitian conjugate (1) * @param[in] dslash_type The type of Dslash operation + * @param[in] laplace3D Whether we are applying the 3-d Laplace operator + * (in the case of dslash_type being QUDA_LAPLACE_DSLASH) */ template void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const real_t *const *longlink, const real_t *const *ghostFatlink, const real_t *const *ghostLonglink, const real_t *spinorField, const real_t *const *fwd_nbr_spinor, - const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type) + const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, + QudaDslashType dslash_type, int laplace3D) { + if (laplace3D < 4 && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("laplace3D = %d only supported for Laplace dslash (%d requested)", laplace3D, dslash_type); + #pragma omp parallel for for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0; @@ -66,6 +72,7 @@ void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const r int offset = stag_spinor_site_size * sid; for (int dir = 0; dir < 8; dir++) { + if (laplace3D == dir / 2) continue; // skip dimensions if needed const int nFace = dslash_type == QUDA_ASQTAD_DSLASH ?
3 : 1; const real_t *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, ghostFatlinkEven, ghostFatlinkOdd, 1, 1); @@ -108,7 +115,7 @@ void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const r } void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { @@ -139,27 +146,26 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF long_link.Ghost()[3].data()}; if (in.Precision() == QUDA_DOUBLE_PRECISION) { - staggeredDslashReference(static_cast(out.data()), reinterpret_cast(qdp_fatlink), + staggeredDslashReference(out.data(), reinterpret_cast(qdp_fatlink), reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), - reinterpret_cast(ghost_longlink), static_cast(in.data()), + reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } else if (in.Precision() == QUDA_SINGLE_PRECISION) { - staggeredDslashReference(static_cast(out.data()), reinterpret_cast(qdp_fatlink), + staggeredDslashReference(out.data(), reinterpret_cast(qdp_fatlink), reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), - reinterpret_cast(ghost_longlink), static_cast(in.data()), + reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } } void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -169,11 +175,12 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not // have the minus sign. Inverting the expected dagger convention // solves this discrepancy. - stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type); - stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type); + stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type, laplace3D); + stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type, laplace3D); if (dslash_type == QUDA_LAPLACE_DSLASH) { - double kappa = 1.0 / (8 + mass); + int dimension = laplace3D < 4 ? 
3 : 4; + double kappa = 1.0 / (2 * dimension + mass); xpay(in.data(), kappa, out.data(), out.Length(), out.Precision()); } else { axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision()); @@ -181,12 +188,9 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel } void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -197,15 +201,16 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga quda::ColorSpinorField tmp(csParam); // Apply mat in sequence - stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type); - stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type); + stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type, laplace3D); + stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type, laplace3D); } void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type, + int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } + if (laplace3D < 4) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + checkPrecision(in, fat_link); // assert we have single-parity spinors if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET) @@ -225,8 +230,8 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi quda::ColorSpinorField tmp(csParam); // dagger bit does not matter - stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type); - stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type); + stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type, laplace3D); + stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type, laplace3D); double msq_x4 = mass * mass * 4; if (in.Precision() == QUDA_DOUBLE_PRECISION) { diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index 34625abfa3..557be21bd0 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -21,9 +21,13 @@ void setDims(int *); * @param[in] oddBit 0 for D_eo, 1 for D_oe * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled).
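To make the convention documented above concrete: `laplace3D == 4` selects the regular 4-d operator, while a value < 4 drops that dimension from the stencil, which is also what ties `kappa` to the number of active dimensions in `stag_mat` above. A hypothetical helper capturing just that normalisation (illustration only, not part of the patch):

```cpp
#include <cassert>

// kappa normalisation for the Laplace operator as used in stag_mat above:
// a d-dimensional Laplace stencil has 2*d neighbour terms, so kappa = 1 / (2*d + mass).
double laplace_kappa(int laplace3D, double mass)
{
  assert(laplace3D >= 0 && laplace3D <= 4);
  const int dimension = laplace3D < 4 ? 3 : 4; // a value < 4 names the dimension that is skipped
  return 1.0 / (2 * dimension + mass);
}
```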
*/ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the full parity staggered-type dslash * @@ -35,9 +39,13 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the full parity staggered-type matdag_mat * @@ -49,9 +57,13 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the even-even or odd-odd preconditioned staggered dslash * @@ -64,6 +76,11 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga * @param[in] dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator * @param[in] parity Parity of preconditioned dslash * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled).
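For context on the preconditioned operator these declarations describe: `stag_matpc` applies the dslash to the opposite parity and back, then forms 4m² in − D D in (the `msq_x4` term in the definition above). A minimal sketch of that composition, with the two dslash applications supplied as callables since this is a sketch rather than the QUDA API:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

using Vec = std::vector<double>;

// out = (4 m^2 - D_{p,op} D_{op,p}) in on a single-parity vector, the operator
// verified by stag_matpc above (sign conventions elided).
Vec matpc_sketch(double mass, const Vec &in, const std::function<Vec(const Vec &)> &dslash_to_other_parity,
                 const std::function<Vec(const Vec &)> &dslash_back)
{
  Vec tmp = dslash_to_other_parity(in); // D_{op,p} in
  Vec out = dslash_back(tmp);           // D_{p,op} D_{op,p} in
  const double msq_x4 = 4.0 * mass * mass;
  for (std::size_t k = 0; k < in.size(); k++) out[k] = msq_x4 * in[k] - out[k];
  return out;
}
```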
*/ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type, + int laplace3D); diff --git a/tests/io_test.cpp b/tests/io_test.cpp index 2618b0e479..25296b02b9 100644 --- a/tests/io_test.cpp +++ b/tests/io_test.cpp @@ -31,6 +31,9 @@ TEST_P(GaugeIOTest, verify) gauge_param.cpu_prec = ::testing::get<0>(param); if (!quda::is_enabled(gauge_param.cpu_prec)) GTEST_SKIP(); gauge_param.cuda_prec = gauge_param.cpu_prec; + gauge_param.cuda_prec_sloppy = gauge_param.cpu_prec; + gauge_param.cuda_prec_precondition = gauge_param.cpu_prec; + gauge_param.cuda_prec_eigensolver = gauge_param.cpu_prec; gauge_param.t_boundary = QUDA_PERIODIC_T; // Allocate host side memory for the gauge field. diff --git a/tests/laph_test.cpp b/tests/laph_test.cpp index ac20bce407..75f244d406 100644 --- a/tests/laph_test.cpp +++ b/tests/laph_test.cpp @@ -114,8 +114,8 @@ auto laph_test(test_t param) std::vector qudaRes(nSink * nEv * Lt * nSpin, 0.); int X[4] = {xdim, ydim, zdim, tdim}; - laphSinkProject((__complex__ double *)qudaRes.data(), (void **)snkPtr.data(), nSink, tileSink, - (void **)evPtr.data(), nEv, tileEv, &invParam, X); + laphSinkProject((__complex__ double *)qudaRes.data(), snkPtr.data(), nSink, tileSink, evPtr.data(), nEv, tileEv, + &invParam, X); printfQuda("laphSinkProject Done: %g secs, %g Gflops\n", invParam.secs, invParam.gflops / invParam.secs); auto tol = getTolerance(cuda_prec); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index a9b499d71a..1b5609bfda 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -85,14 +85,16 @@ struct StaggeredDslashTestWrapper { for (int i = 0; i < Nsrc; i++) { switch (dtest_type) { case dslash_test_type::Dslash: - stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type); + stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type, laplace3D); break; case dslash_test_type::MatPC: - stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type); + stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type, laplace3D); + break; + case dslash_test_type::Mat: + stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; - case dslash_test_type::Mat: stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); break; case dslash_test_type::MatDagMat: - stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); + stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; default: errorQuda("Test type %d not defined", static_cast(dtest_type)); } @@ -406,12 +408,12 @@ struct StaggeredDslashTestWrapper { size_t ghost_bytes = cudaSpinor[0].GhostBytes(); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - 
::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); printfQuda( diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 93ae358896..95ebff044b 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -75,12 +75,6 @@ void init() gauge_param = newQudaGaugeParam(); setStaggeredGaugeParam(gauge_param); - QudaGaugeSmearParam smear_param; - if (gauge_smear) { - smear_param = newQudaGaugeSmearParam(); - setGaugeSmearParam(smear_param); - } - // Though no inversions are performed, the inv_param // structure contains all the information we need to // construct the dirac operator. @@ -166,10 +160,20 @@ std::vector eigensolve(test_t test_param) eig_param.compute_svd = ::testing::get<3>(test_param); eig_param.spectrum = ::testing::get<4>(test_param); - if (eig_param.use_pc) - eig_inv_param.solution_type = QUDA_MATPC_SOLUTION; - else - eig_inv_param.solution_type = QUDA_MAT_SOLUTION; + eig_inv_param.solution_type = eig_param.use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; + + // whether we are using the resident smeared gauge or not + eig_param.use_smeared_gauge = gauge_smear; + + if (dslash_type == QUDA_LAPLACE_DSLASH) { + int dimension = laplace3D < 4 ? 3 : 4; + eig_inv_param.kappa = 1.0 / (2 * dimension + mass); + eig_inv_param.laplace3D = laplace3D; + if (dimension == 3) { + eig_param.ortho_dim = laplace3D; + eig_param.ortho_dim_size_local = tdim; + } + } // For gtest testing, we prohibit the use of polynomial acceleration as // the fine tuning required can inhibit convergence of an otherwise @@ -192,24 +196,65 @@ std::vector eigensolve(test_t test_param) if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE)) display_test_info(eig_param); + // Gauge Smearing Routines + if (gauge_smear) { + quda::host_timer_t host_timer; + host_timer.start(); // start the timer + + std::vector obs_param(gauge_smear_steps / measurement_interval + 1); + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i] = newQudaGaugeObservableParam(); + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE; + obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + } + + // We here set all the problem parameters for all possible smearing types. + QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); + setGaugeSmearParam(smear_param); + + switch (smear_param.smear_type) { + case QUDA_GAUGE_SMEAR_APE: + case QUDA_GAUGE_SMEAR_STOUT: + case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: + case QUDA_GAUGE_SMEAR_HYP: { + performGaugeSmearQuda(&smear_param, obs_param.data()); + break; + } + + // Here we use a typical use case which is different from simple smearing in that + // the user will want to compute the plaquette values to compute the gauge energy. 
+ case QUDA_GAUGE_SMEAR_WILSON_FLOW: + case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: { + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + } + performWFlowQuda(&smear_param, obs_param.data()); + break; + } + default: errorQuda("Undefined gauge smear type %d given", smear_param.smear_type); + } + + host_timer.stop(); + printfQuda("Time for gauge smearing = %f\n", host_timer.last()); + } + // Vector construct START //---------------------------------------------------------------------------- // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector evecs(n_eig); quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. std::vector host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues - std::vector<__complex__ double> evals(eig_n_conv); + int n_batch = laplace3D == 3 ? eig_param.ortho_dim_size_local * comm_dim(3) : 1; + std::vector<__complex__ double> evals(eig_n_conv * n_batch); // Vector construct END //---------------------------------------------------------------------------- @@ -226,19 +271,20 @@ std::vector eigensolve(test_t test_param) printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last()); // Perform host side verification of eigenvector if requested. - // ... std::vector residua(eig_n_conv, 0.0); // Perform host side verification of eigenvector if requested. 
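Note the eigenvalue layout the 3-d path introduces here: `evals` holds `eig_n_conv * n_batch` entries in batch-major order, so the eigenvalue of vector `i` on timeslice `b` lives at `evals[b * eig_n_conv + i]`. A small sketch of the gather performed by the verification loop below (using `std::complex<double>` in place of `__complex__ double`):

```cpp
#include <complex>
#include <vector>

// Collect the per-timeslice eigenvalues of eigenvector i from the batch-major
// layout used above: the entry for slice b is evals[b * n_conv + i].
std::vector<std::complex<double>> gather_evals(const std::vector<std::complex<double>> &evals, int n_conv,
                                               int n_batch, int i)
{
  std::vector<std::complex<double>> lambda(n_batch);
  for (int b = 0; b < n_batch; b++) lambda[b] = evals[b * n_conv + i];
  return lambda;
}
```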
if (verify_results) { for (int i = 0; i < eig_n_conv; i++) { if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) { - double _Complex sigma = evals[i]; + std::vector<double _Complex> sigma(n_batch); + for (auto b = 0; b < n_batch; b++) sigma[b] = evals[b * eig_n_conv + i]; residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, - cpuLongQDP); + cpuLongQDP, laplace3D); } else { - double _Complex lambda = evals[i]; - residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP); + std::vector<double _Complex> lambda(n_batch); + for (auto b = 0; b < n_batch; b++) lambda[b] = evals[b * eig_n_conv + i]; + residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP, laplace3D); } } } @@ -285,7 +331,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp index c73c85378e..8918c3df4d 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -32,6 +32,9 @@ bool skip_test(test_t test_param) auto compute_svd = ::testing::get<3>(test_param); auto spectrum = ::testing::get<4>(test_param); + // 3-d operator only supported for Laplace + if (eig_type == QUDA_EIG_TR_LANCZOS_3D && dslash_type != QUDA_LAPLACE_DSLASH) return true; + // Reverse engineer the operator type QudaSolveType combo_solve_type = get_solve_type(use_norm_op, use_pc, compute_svd); if (combo_solve_type == QUDA_DIRECT_PC_SOLVE) { @@ -87,6 +90,7 @@ bool skip_test(test_t test_param) case QUDA_LAPLACE_DSLASH: switch (eig_type) { case QUDA_EIG_TR_LANCZOS: + case QUDA_EIG_TR_LANCZOS_3D: case QUDA_EIG_BLK_TR_LANCZOS: if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; break; @@ -117,6 +121,8 @@ TEST_P(StaggeredEigensolveTest, verify) auto eig_type = ::testing::get<0>(GetParam()); if (eig_type == QUDA_EIG_IR_ARNOLDI || eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; auto tol = factor * eig_param.tol; + if (dslash_type == QUDA_LAPLACE_DSLASH) laplace3D = eig_type == QUDA_EIG_TR_LANCZOS_3D ?
3 : 4; + for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol); } @@ -143,6 +149,9 @@ auto hermitian_solvers = Values(QUDA_EIG_TR_LANCZOS, QUDA_EIG_BLK_TR_LANCZOS, QU // Can solve non-hermitian systems auto non_hermitian_solvers = Values(QUDA_EIG_IR_ARNOLDI); +// Batched solvers for 3-d operators +auto batched_solvers = Values(QUDA_EIG_TR_LANCZOS_3D); + // Eigensolver spectrum types auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG); auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG, @@ -171,3 +180,9 @@ INSTANTIATE_TEST_SUITE_P(DirectFull, StaggeredEigensolveTest, ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), gettestname); + +// 3-d full system direct solve +INSTANTIATE_TEST_SUITE_P(DirectFull3D, StaggeredEigensolveTest, + ::testing::Combine(batched_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_FALSE), hermitian_spectrum), + gettestname); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index a098324ef6..77d9cd5a75 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -450,9 +450,9 @@ std::vector> solve(test_t param) // Create an appropriate subset of the full out_multishift vector std::vector out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; - res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); + res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param, laplace3D); } else { - res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, n); + res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, laplace3D, n); } } } @@ -510,7 +510,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 7ef2c270b3..1b659eec59 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -53,6 +53,10 @@ bool skip_test(test_t param) // MR struggles with the staggered and asqtad spectrum, it's not MR's fault if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + + // BiCGStab-L is unstable with staggered operator with low precision + // we could likely restore this when 20-bit quark fields are merged in + if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true; } // CG3 is rather unstable with low precision diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp index 265fed1248..5ddfc52956 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -18,19 +18,6 @@ #define MAX(a, b) ((a) > (b) ? 
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 7ef2c270b3..1b659eec59 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -53,6 +53,10 @@ bool skip_test(test_t param)
     // MR struggles with the staggered and asqtad spectrum, it's not MR's fault
     if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER)
       return true;
+
+    // BiCGStab-L is unstable with staggered operator with low precision
+    // we could likely restore this when 20-bit quark fields are merged in
+    if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true;
   }
 
   // CG3 is rather unstable with low precision
diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp
index 265fed1248..5ddfc52956 100644
--- a/tests/su3_test.cpp
+++ b/tests/su3_test.cpp
@@ -18,19 +18,6 @@
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-// Smearing variables
-double gauge_smear_rho = 0.1;
-double gauge_smear_epsilon = 0.1;
-double gauge_smear_alpha = 0.6;
-double gauge_smear_alpha1 = 0.75;
-double gauge_smear_alpha2 = 0.6;
-double gauge_smear_alpha3 = 0.3;
-int gauge_smear_steps = 50;
-QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
-int gauge_smear_dir_ignore = -1;
-int measurement_interval = 5;
-bool su_project = true;
-
 void display_test_info()
 {
   printfQuda("running the following test:\n");
@@ -68,49 +55,6 @@ void display_test_info()
   return;
 }
 
-void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
-{
-  CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE},
-                                                                {"stout", QUDA_GAUGE_SMEAR_STOUT},
-                                                                {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT},
-                                                                {"hyp", QUDA_GAUGE_SMEAR_HYP},
-                                                                {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW},
-                                                                {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}};
-
-  // Option group for SU(3) related options
-  auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests");
-
-  opgroup
-    ->add_option(
-      "--su3-smear-type",
-      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)")
-    ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
-  ;
-  opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)");
-
-  opgroup->add_option("--su3-smear-rho", gauge_smear_rho,
-                      "rho coefficient for Stout and Over-Improved Stout smearing (default 0.1)");
-
-  opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon,
-                      "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)");
-
-  opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)");
-  opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
-  opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
-
-  opgroup->add_option(
-    "--su3-smear-dir-ignore", gauge_smear_dir_ignore,
-    "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)");
-
-  opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)");
-
-  opgroup->add_option("--su3-measurement-interval", measurement_interval,
-                      "Measure the field energy and/or topological charge every Nth step (default 5) ");
-
-  opgroup->add_option("--su3-project", su_project,
-                      "Project smeared gauge onto su3 manifold at measurement interval (default true)");
-}
-
 int main(int argc, char **argv)
 {
@@ -257,7 +201,7 @@ int main(int argc, char **argv)
   QudaGaugeObservableParam *obs_param = new QudaGaugeObservableParam[gauge_smear_steps / measurement_interval + 1];
   for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) {
     obs_param[i] = newQudaGaugeObservableParam();
-    obs_param[i].compute_plaquette = QUDA_BOOLEAN_FALSE;
+    obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE;
     obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE;
     obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
   }
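With the plaquette measurement re-enabled above, each measurement interval records both the plaquette and the topological charge during smearing. A rough sketch of how such an observable array pairs with a smearing call, assuming the `newQudaGaugeSmearParam`/`performGaugeSmearQuda` entry points from quda.h and that `initQuda`/`loadGaugeQuda` have already been called (all parameter values are illustrative only):

```cpp
#include <vector>
#include <quda.h>

// Sketch: smear the resident gauge field, measuring observables as we go.
void smear_and_measure(int n_steps, int meas_interval)
{
  QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam();
  smear_param.smear_type = QUDA_GAUGE_SMEAR_STOUT;
  smear_param.n_steps = n_steps;
  smear_param.meas_interval = meas_interval;
  smear_param.rho = 0.1;
  smear_param.struct_size = sizeof(smear_param);

  // One observable struct per measurement, plus one for the unsmeared field
  int n_meas = n_steps / meas_interval + 1;
  std::vector<QudaGaugeObservableParam> obs_param(n_meas);
  for (int i = 0; i < n_meas; i++) {
    obs_param[i] = newQudaGaugeObservableParam();
    obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; // now measured, per this change
    obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE;
  }

  performGaugeSmearQuda(&smear_param, obs_param.data());
}
```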
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index 467b688768..0c15b5ea56 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -73,7 +73,6 @@ quda::mgarray<bool> mg_vec_partfile = {};
 QudaInverterType inv_type;
 bool inv_deflate = false;
 bool inv_multigrid = false;
-bool gauge_smear = false;
 QudaInverterType precon_type = QUDA_INVALID_INVERTER;
 QudaSchwarzType precon_schwarz_type = QUDA_INVALID_SCHWARZ;
 QudaAcceleratorType precon_accelerator_type = QUDA_INVALID_ACCELERATOR;
@@ -303,13 +302,18 @@ double eofa_mq2 = 0.85;
 double eofa_mq3 = 1.0;
 
 // SU(3) smearing options
+bool gauge_smear = false;
+QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
 double gauge_smear_rho = 0.1;
-double gauge_smear_epsilon = 1.0;
+double gauge_smear_epsilon = 0.1;
 double gauge_smear_alpha = 0.6;
+double gauge_smear_alpha1 = 0.75;
+double gauge_smear_alpha2 = 0.6;
+double gauge_smear_alpha3 = 0.3;
 int gauge_smear_steps = 5;
-QudaWFlowType wflow_type = QUDA_WFLOW_TYPE_WILSON;
+int gauge_smear_dir_ignore = -1;
 int measurement_interval = 5;
-QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
+bool su_project = true;
 
 // contract options
 QudaContractType contract_type = QUDA_CONTRACT_TYPE_STAGGERED_FT_T;
@@ -411,6 +415,7 @@ namespace
   CLI::TransformPairs<QudaEigType> eig_type_map {{"trlm", QUDA_EIG_TR_LANCZOS},
                                                  {"blktrlm", QUDA_EIG_BLK_TR_LANCZOS},
+                                                 {"trlm-3d", QUDA_EIG_TR_LANCZOS_3D},
                                                  {"iram", QUDA_EIG_IR_ARNOLDI},
                                                  {"blkiram", QUDA_EIG_BLK_IR_ARNOLDI}};
@@ -454,11 +459,12 @@ namespace
     {"SR", QUDA_SPECTRUM_SR_EIG}, {"LR", QUDA_SPECTRUM_LR_EIG}, {"SM", QUDA_SPECTRUM_SM_EIG},
     {"LM", QUDA_SPECTRUM_LM_EIG}, {"SI", QUDA_SPECTRUM_SI_EIG}, {"LI", QUDA_SPECTRUM_LI_EIG}};
 
-  CLI::TransformPairs<QudaWFlowType> wflow_type_map {{"wilson", QUDA_WFLOW_TYPE_WILSON},
-                                                     {"symanzik", QUDA_WFLOW_TYPE_SYMANZIK}};
-
-  CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {
-    {"ape", QUDA_GAUGE_SMEAR_APE}, {"stout", QUDA_GAUGE_SMEAR_STOUT}, {"ovr-imp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}};
+  CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE},
+                                                                {"stout", QUDA_GAUGE_SMEAR_STOUT},
+                                                                {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT},
+                                                                {"hyp", QUDA_GAUGE_SMEAR_HYP},
+                                                                {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW},
+                                                                {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}};
 
   CLI::TransformPairs<QudaSetupType> setup_type_map {{"test", QUDA_TEST_VECTOR_SETUP}, {"null", QUDA_TEST_VECTOR_SETUP}};
@@ -766,7 +772,6 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
 void add_eigen_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
-  CLI::QUDACheckedTransformer prec_transform(precision_map);
   // Option group for Eigensolver related options
   auto opgroup = quda_app->add_option_group("Eigensolver", "Options controlling eigensolver");
@@ -1121,9 +1126,16 @@ void add_eofa_option_group(std::shared_ptr<QUDAApp> quda_app)
 
 void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
-  // Option group for SU(3) related options
   auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests");
+
+  opgroup
+    ->add_option(
+      "--su3-smear-type",
+      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)")
+    ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
+  ;
+  opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)");
 
   opgroup->add_option("--su3-smear-rho", gauge_smear_rho,
@@ -1131,18 +1143,23 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
                       "rho coefficient for Stout and Over-Improved Stout smearing (default 0.1)");
 
   opgroup->add_option(
     "--su3-smear-epsilon", gauge_smear_epsilon,
-    "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 1.0)");
+    "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 0.1)");
 
-  opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 10)");
+  opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)");
+  opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
+  opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
 
-  opgroup->add_option("--su3-wflow-type", wflow_type, "The type of action to use in the wilson flow (default wilson)")
-    ->transform(CLI::QUDACheckedTransformer(wflow_type_map));
+  opgroup->add_option(
+    "--su3-smear-dir-ignore", gauge_smear_dir_ignore,
+    "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)");
 
-  opgroup->add_option("--su3-smear-type", gauge_smear_type, "The type of smearing to use (default stout)")
-    ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
+  opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 10)");
 
   opgroup->add_option("--su3-measurement-interval", measurement_interval,
-                      "Measure the field energy and topological charge every Nth step (default 5) ");
+                      "Measure the field energy and/or topological charge every Nth step (default 5) ");
+
+  opgroup->add_option("--su3-project", su_project,
+                      "Project smeared gauge onto su3 manifold at measurement interval (default true)");
 }
 
 void add_madwf_option_group(std::shared_ptr<QUDAApp> quda_app)
diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h
index aec9e84c7e..67ae7b7372 100644
--- a/tests/utils/command_line_params.h
+++ b/tests/utils/command_line_params.h
@@ -555,10 +555,14 @@ extern double eofa_mq3;
 extern double gauge_smear_rho;
 extern double gauge_smear_epsilon;
 extern double gauge_smear_alpha;
+extern double gauge_smear_alpha1;
+extern double gauge_smear_alpha2;
+extern double gauge_smear_alpha3;
 extern int gauge_smear_steps;
-extern QudaWFlowType wflow_type;
+extern int gauge_smear_dir_ignore;
 extern int measurement_interval;
 extern QudaGaugeSmearType gauge_smear_type;
+extern bool su_project;
 
 extern double smear_coeff;
 extern int smear_n_steps;
diff --git a/tests/utils/host_blas.cpp b/tests/utils/host_blas.cpp
index c9d8c63a59..e918f77589 100644
--- a/tests/utils/host_blas.cpp
+++ b/tests/utils/host_blas.cpp
@@ -63,20 +63,20 @@ void mxpy(void *x, void *y, int len, QudaPrecision precision)
 }
 
 // returns the square of the L2 norm of the vector
-template <typename real_t> inline double norm2(real_t *v, int len)
+template <typename real_t> inline double norm2(real_t *v, int len, bool global)
 {
   double sum = 0.0;
   for (int i = 0; i < len; i++) sum += v[i] * v[i];
-  quda::comm_allreduce_sum(sum);
+  if (global) quda::comm_allreduce_sum(sum);
   return sum;
 }
 
-double norm_2(void *v, int len, QudaPrecision precision)
+double norm_2(void *v, int len, QudaPrecision precision, bool global)
 {
   if (precision == QUDA_DOUBLE_PRECISION)
-    return norm2((double *)v, len);
+    return norm2((double *)v, len, global);
   else
-    return norm2((float *)v, len);
+    return norm2((float *)v, len, global);
 }
 
 // performs the operation y[i] = x[i] + a*y[i]
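`norm_2` now takes a `global` flag, declared with a default of `true` in host_utils.h below, so existing call sites keep the communicator-wide reduction while new callers can request a per-rank result. A usage sketch, assuming the test-utils environment (comms initialized, headers on the include path):

```cpp
#include <host_utils.h> // test-utils header declaring norm_2 and printfQuda

// Sketch: squared L2 norms of a host vector, with and without the allreduce
void norm_demo(double *v, int local_len)
{
  double global_sq = norm_2(v, local_len, QUDA_DOUBLE_PRECISION);        // summed over all ranks
  double local_sq = norm_2(v, local_len, QUDA_DOUBLE_PRECISION, false);  // this rank only
  printfQuda("norm2: local %e, global %e\n", local_sq, global_sq);
}
```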
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index c63d124dc8..473a527ac5 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -250,7 +250,7 @@ void exchange_llfat_init(QudaPrecision prec);
 void exchange_llfat_cleanup(void);
 
 // Implemented in host_blas.cpp
-double norm_2(void *vector, int len, QudaPrecision precision);
+double norm_2(void *vector, int len, QudaPrecision precision, bool global = true);
 void mxpy(void *x, void *y, int len, QudaPrecision precision);
 void ax(double a, void *x, int len, QudaPrecision precision);
 void cax(double _Complex a, void *x, int len, QudaPrecision precision);
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index 8ec29c514b..58cc3fb758 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -221,6 +221,7 @@ const char *get_eig_type_str(QudaEigType type)
   switch (type) {
   case QUDA_EIG_TR_LANCZOS: ret = "trlm"; break;
   case QUDA_EIG_BLK_TR_LANCZOS: ret = "blktrlm"; break;
+  case QUDA_EIG_TR_LANCZOS_3D: ret = "trlm_3d"; break;
   case QUDA_EIG_IR_ARNOLDI: ret = "iram"; break;
   case QUDA_EIG_BLK_IR_ARNOLDI: ret = "blkiram"; break;
   default: ret = "unknown eigensolver"; break;
diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp
index c6aa1218e9..5255b18315 100644
--- a/tests/utils/set_params.cpp
+++ b/tests/utils/set_params.cpp
@@ -5,12 +5,16 @@
 void setGaugeSmearParam(QudaGaugeSmearParam &smear_param)
 {
+  smear_param.smear_type = gauge_smear_type;
   smear_param.alpha = gauge_smear_alpha;
   smear_param.rho = gauge_smear_rho;
   smear_param.epsilon = gauge_smear_epsilon;
   smear_param.n_steps = gauge_smear_steps;
   smear_param.meas_interval = measurement_interval;
-  smear_param.smear_type = gauge_smear_type;
+  smear_param.alpha1 = gauge_smear_alpha1;
+  smear_param.alpha2 = gauge_smear_alpha2;
+  smear_param.alpha3 = gauge_smear_alpha3;
+  smear_param.dir_ignore = gauge_smear_dir_ignore;
   smear_param.struct_size = sizeof(smear_param);
 }
@@ -125,16 +129,17 @@ void setInvertParam(QudaInvertParam &inv_param)
 {
   // Set dslash type
   inv_param.dslash_type = dslash_type;
+  int dimension = laplace3D < 4 ? 3 : 4;
 
   // Use kappa or mass normalisation
   if (kappa == -1.0) {
     inv_param.mass = mass;
     inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass));
-    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (8 + mass);
+    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (2 * dimension + mass);
   } else {
     inv_param.kappa = kappa;
     inv_param.mass = 0.5 / kappa - (1.0 + 3.0 / anisotropy);
-    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 8.0;
+    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 2 * dimension;
   }
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Kappa = %.8f Mass = %.8f\n", inv_param.kappa, inv_param.mass);
@@ -936,7 +941,8 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
   // Solver params
   inv_param.verbosity = verbosity;
   inv_param.mass = mass;
-  inv_param.kappa = kappa = 1.0 / (8.0 + mass); // for Laplace operator
+  int dimension = laplace3D < 4 ? 3 : 4;
+  inv_param.kappa = 1.0 / (2 * dimension + mass); // for Laplace operator
   inv_param.laplace3D = laplace3D; // for Laplace operator
 
   if (Nsrc < Nsrc_tile || Nsrc % Nsrc_tile != 0)
@@ -1003,7 +1009,7 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
   inv_param.solve_type = solve_type;
   inv_param.matpc_type = matpc_type;
   inv_param.dagger = QUDA_DAG_NO;
-  inv_param.mass_normalization = QUDA_MASS_NORMALIZATION;
+  inv_param.mass_normalization = dslash_type == QUDA_LAPLACE_DSLASH ? QUDA_KAPPA_NORMALIZATION : QUDA_MASS_NORMALIZATION;
 
   inv_param.cpu_prec = cpu_prec;
   inv_param.cuda_prec = prec;
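The Laplace normalization above follows from the free-field diagonal of the operator: a d-dimensional Laplace stencil contributes 2d plus the mass term, so kappa = 1 / (2 * dimension + mass), with dimension = 3 when laplace3D selects an ignored direction and 4 otherwise. A small standalone check of the forward and inverse relations used by the two hunks above (the mass value is hypothetical):

```cpp
#include <cstdio>

int main()
{
  const double mass = 0.1;
  for (int laplace3D : {3, 4}) {
    int dimension = laplace3D < 4 ? 3 : 4;
    double kappa = 1.0 / (2 * dimension + mass);    // as in setStaggeredInvertParam
    double mass_back = 1.0 / kappa - 2 * dimension; // inverse relation from setInvertParam
    printf("d = %d: kappa = %f, recovered mass = %f\n", dimension, kappa, mass_back);
  }
  return 0;
}
```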