From b22beda16259eef5af67c920d9b72f24121dd856 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 17 Aug 2023 14:24:39 -0400
Subject: [PATCH 01/53] Misc things I noticed elsewhere

---
 lib/inv_ca_gcr.cpp         | 3 +++
 tests/utils/set_params.cpp | 9 ++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp
index f9e605ea86..feb8555334 100644
--- a/lib/inv_ca_gcr.cpp
+++ b/lib/inv_ca_gcr.cpp
@@ -146,6 +146,7 @@ namespace quda
     create(x, b);
 
     if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_PREAMBLE);
+    if (param.is_preconditioner) commGlobalReductionPush(param.global_reduction);
 
     // compute b2, but only if we need to
     bool fixed_iteration = param.sloppy_converge && n_krylov == param.maxiter && !param.compute_true_res;
@@ -397,6 +398,8 @@ namespace quda
     }
 
     PrintSummary("CA-GCR", total_iter, r2, b2, stop, param.tol_hq);
+
+    if (param.is_preconditioner) commGlobalReductionPop();
   }
 
 } // namespace quda
diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp
index b862c3ced4..62cd9e8c90 100644
--- a/tests/utils/set_params.cpp
+++ b/tests/utils/set_params.cpp
@@ -937,9 +937,11 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
 
   // domain decomposition preconditioner parameters
   inv_param.inv_type_precondition = precon_type;
+  inv_param.schwarz_type = precon_schwarz_type;
+  inv_param.precondition_cycle = precon_schwarz_cycle;
   inv_param.tol_precondition = tol_precondition;
   inv_param.maxiter_precondition = maxiter_precondition;
-  inv_param.verbosity_precondition = QUDA_SILENT;
+  inv_param.verbosity_precondition = verbosity_precondition;
   inv_param.cuda_prec_precondition = prec_precondition;
   inv_param.cuda_prec_eigensolver = prec_eigensolver;
 
@@ -952,6 +954,11 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
   inv_param.ca_lambda_min = ca_lambda_min;
   inv_param.ca_lambda_max = ca_lambda_max;
 
+  // Set preconditioner CA info
+  inv_param.ca_basis_precondition = ca_basis_precondition;
+  inv_param.ca_lambda_min_precondition = ca_lambda_min_precondition;
+  inv_param.ca_lambda_max_precondition = ca_lambda_max_precondition;
+
   inv_param.solution_type = solution_type;
   inv_param.solve_type = solve_type;
   inv_param.matpc_type = matpc_type;

From b78720ec1f9f7dd1e0bdae5a3582c311abf37463 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 17 Aug 2023 14:41:20 -0400
Subject: [PATCH 02/53] Removed argc/argv hacks because they are not used under
 the hood anyway

---
 tests/staggered_dslash_ctest.cpp    | 13 +------------
 tests/staggered_dslash_test.cpp     | 13 +------------
 tests/staggered_dslash_test_utils.h | 16 ++++++----------
 3 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index 8505b04fe6..bb20115554 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -2,9 +2,6 @@
 
 using namespace quda;
 
-// For loading the gauge fields
-int argc_copy;
-char **argv_copy;
 bool ctest_all_partitions = false;
 
 using ::testing::Bool;
@@ -77,7 +74,7 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
     }
     updateR();
 
-    dslash_test_wrapper.init_ctest(argc_copy, argv_copy, prec, recon);
+    dslash_test_wrapper.init_ctest(prec, recon);
     display_test_info(prec, recon);
   }
 
@@ -124,14 +121,6 @@ int main(int argc, char **argv)
 
   initComms(argc, argv, gridsize_from_cmdline);
 
-  // The 'SetUp()' method of the Google Test class from which DslashTest
-  // in derived has no arguments, but QUDA's implementation requires the
-  // use of argc and argv to set up the test via the function 'init'.
-  // As a workaround, we declare argc_copy and argv_copy as global pointers
-  // so that they are visible inside the 'init' function.
-  argc_copy = argc;
-  argv_copy = argv;
-
   // Ensure gtest prints only from rank 0
   ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index f61e62d88c..936d457158 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -2,9 +2,6 @@
 
 using namespace quda;
 
-int argc_copy;
-char **argv_copy;
-
 class StaggeredDslashTest : public ::testing::Test
 {
 protected:
@@ -26,7 +23,7 @@ class StaggeredDslashTest : public ::testing::Test
 
   virtual void SetUp()
   {
-    dslash_test_wrapper.init_test(argc_copy, argv_copy);
+    dslash_test_wrapper.init_test();
     display_test_info();
   }
 
@@ -72,14 +69,6 @@ int main(int argc, char **argv)
 
   initComms(argc, argv, gridsize_from_cmdline);
 
-  // The 'SetUp()' method of the Google Test class from which DslashTest
-  // in derived has no arguments, but QUDA's implementation requires the
-  // use of argc and argv to set up the test via the function 'init'.
-  // As a workaround, we declare argc_copy and argv_copy as global pointers
-  // so that they are visible inside the 'init' function.
-  argc_copy = argc;
-  argv_copy = argv;
-
   // Ensure gtest prints only from rank 0
   ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 5c6d885673..0c8f434031 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -74,10 +74,6 @@ struct StaggeredDslashTestWrapper {
 
   Dirac *dirac;
 
-  // For loading the gauge fields
-  int argc_copy;
-  char **argv_copy;
-
   // Split grid options
   bool test_split_grid = false;
   int num_src = 1;
@@ -111,7 +107,7 @@ struct StaggeredDslashTestWrapper {
     }
   }
 
-  void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon_)
+  void init_ctest(int precision, QudaReconstructType link_recon_)
   {
     gauge_param = newQudaGaugeParam();
     inv_param = newQudaInvertParam();
@@ -131,10 +127,10 @@ struct StaggeredDslashTestWrapper {
 
     link_recon = link_recon_;
 
-    init(argc, argv);
+    init();
   }
 
-  void init_test(int argc, char **argv)
+  void init_test()
   {
     gauge_param = newQudaGaugeParam();
     inv_param = newQudaInvertParam();
@@ -142,10 +138,10 @@ struct StaggeredDslashTestWrapper {
     setStaggeredGaugeParam(gauge_param);
     setStaggeredInvertParam(inv_param);
 
-    init(argc, argv);
+    init();
   }
 
-  void init(int argc, char **argv)
+  void init()
   {
     inv_param.split_grid[0] = grid_partition[0];
     inv_param.split_grid[1] = grid_partition[1];
@@ -187,7 +183,7 @@ struct StaggeredDslashTestWrapper {
 
     bool gauge_loaded = false;
     constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu,
-                                           qdp_fatlink_gpu, gauge_param, argc, argv, gauge_loaded);
+                                           qdp_fatlink_gpu, gauge_param, 0, nullptr, gauge_loaded);
 
     // Alright, we've created all the void** links.
     // Create the void* pointers

From b2fc275b962601b490a2c3f2547804dfbd911429 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Fri, 18 Aug 2023 16:49:20 -0400
Subject: [PATCH 03/53] Cleaned up staggered dslash (c)test, enabled support
 for testing half links/other missing things

---
 tests/staggered_dslash_ctest.cpp      |  33 ++---
 tests/staggered_dslash_test.cpp       |  32 ++---
 tests/staggered_dslash_test_utils.h   | 197 +++++++++++---------------
 tests/staggered_eigensolve_test.cpp   |   4 +-
 tests/staggered_invert_test.cpp       |   4 +-
 tests/utils/host_utils.h              |   2 +-
 tests/utils/staggered_gauge_utils.cpp |   6 +-
 tests/utils/staggered_host_utils.cpp  |  15 +-
 8 files changed, 127 insertions(+), 166 deletions(-)

diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index bb20115554..ceffb74bb8 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -24,17 +24,6 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
       return true;
     }
 
-    if (dslash_type == QUDA_ASQTAD_DSLASH && compute_fatlong
-        && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) {
-      warningQuda("Fixed precision unsupported in fat/long compute, skipping...");
-      return true;
-    }
-
-    if (dslash_type == QUDA_ASQTAD_DSLASH && compute_fatlong && (getReconstructNibble(recon) & 1)) {
-      warningQuda("Reconstruct 9 unsupported in fat/long compute, skipping...");
-      return true;
-    }
-
     if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) {
       warningQuda("Fixed precision unsupported for Laplace operator, skipping...");
       return true;
@@ -100,6 +89,10 @@ TEST_P(StaggeredDslashTest, verify)
   double deviation = dslash_test_wrapper.verify();
   double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec);
 
+  if (dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9
+      && dslash_test_wrapper.inv_param.cuda_prec >= QUDA_HALF_PRECISION)
+    tol *= 10; // if recon 9, we tolerate a greater deviation
+
   ASSERT_LE(deviation, tol) << "Reference CPU and QUDA implementations do not agree";
 }
 
@@ -125,13 +118,9 @@ int main(int argc, char **argv)
   ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
 
-  // Only these fermions are supported in this file. Ensure a reasonable default,
-  // ensure that the default is improved staggered
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
-    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
-               get_dslash_str(QUDA_ASQTAD_DSLASH));
-    dslash_type = QUDA_ASQTAD_DSLASH;
-  }
+  // Only these fermions are supported in this file
+  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't
   // ask to build the fat/long links... it doesn't make sense.
@@ -156,12 +145,8 @@ int main(int argc, char **argv)
     }
   }
 
-  if (dslash_type == QUDA_LAPLACE_DSLASH) {
-    if (dtest_type != dslash_test_type::Mat) {
-      errorQuda("Test type %s is not supported for the Laplace operator.\n",
-                get_string(dtest_type_map, dtest_type).c_str());
-    }
-  }
+  if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat)
+    errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str());
 
   int test_rc = RUN_ALL_TESTS();
 
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index 936d457158..7c3524dacf 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -48,6 +48,12 @@ TEST_F(StaggeredDslashTest, verify)
 
   double deviation = dslash_test_wrapper.verify();
   double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec);
+
+  // give it a tiny bump for fixed precision with recon 9 (what recon 8 is mapped to for staggered links)
+  if (dslash_test_wrapper.inv_param.cuda_prec <= QUDA_HALF_PRECISION &&
+      dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9)
+    tol *= 1.1;
+
   ASSERT_LE(deviation, tol) << "reference and QUDA implementations do not agree";
 }
 
@@ -56,6 +62,9 @@ int main(int argc, char **argv)
   // initalize google test
   ::testing::InitGoogleTest(&argc, argv);
 
+  // override the default dslash from Wilson
+  dslash_type = QUDA_ASQTAD_DSLASH;
+
   // command line options
   auto app = make_app();
   app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map));
@@ -73,13 +82,9 @@ int main(int argc, char **argv)
   ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
 
-  // Only these fermions are supported in this file. Ensure a reasonable default,
-  // ensure that the default is improved staggered
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
-    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
-               get_dslash_str(QUDA_ASQTAD_DSLASH));
-    dslash_type = QUDA_ASQTAD_DSLASH;
-  }
+  // Only these fermions are supported in this file
+  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash,
   // and don't ask to build the fat/long links... it doesn't make sense.
@@ -103,17 +108,8 @@ int main(int argc, char **argv)
     }
   }
 
-  if (dslash_type == QUDA_LAPLACE_DSLASH) {
-    if (dtest_type != dslash_test_type::Mat) {
-      errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str());
-    }
-  }
-
-  // If we're building fat/long links, there are some
-  // tests we have to skip.
-  if (dslash_type == QUDA_ASQTAD_DSLASH && compute_fatlong) {
-    if (prec < QUDA_SINGLE_PRECISION) { errorQuda("Fixed-point precision unsupported in fat/long compute"); }
-  }
+  if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat)
+    errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str());
 
   int test_rc = RUN_ALL_TESTS();
 
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 0c8f434031..65cbd21dfa 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -44,17 +44,9 @@ struct DslashTime {
 
 struct StaggeredDslashTestWrapper {
 
-  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
-
   QudaGaugeParam gauge_param;
   QudaInvertParam inv_param;
 
-  void *milc_fatlink_gpu;
-  void *milc_longlink_gpu;
-
-  cpuGaugeField *cpuFat = nullptr;
-  cpuGaugeField *cpuLong = nullptr;
-
   ColorSpinorField spinor;
   ColorSpinorField spinorOut;
   ColorSpinorField spinorRef;
@@ -65,10 +57,13 @@ struct StaggeredDslashTestWrapper {
   std::vector<ColorSpinorField> vp_spinor;
   std::vector<ColorSpinorField> vp_spinor_out;
 
-  // In the HISQ case, we include building fat/long links in this unit test
-  void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
-  void **ghost_fatlink_cpu = nullptr, **ghost_longlink_cpu = nullptr;
+  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *milc_fatlink = nullptr;
+  void *milc_longlink = nullptr;
+  GaugeField *cpuFat = nullptr;
+  GaugeField *cpuLong = nullptr;
 
   QudaParity parity = QUDA_EVEN_PARITY;
 
@@ -78,24 +73,27 @@ struct StaggeredDslashTestWrapper {
   bool test_split_grid = false;
   int num_src = 1;
 
+  // Whether or not we need the ghost zones
+  bool need_ghost_zone = false;
+
   void staggeredDslashRef()
   {
     // compare to dslash reference implementation
     printfQuda("Calculating reference implementation...");
     switch (dtest_type) {
     case dslash_test_type::Dslash:
-      staggeredDslash(spinorRef, qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, spinor,
+      staggeredDslash(spinorRef, qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor,
                       parity, dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
       break;
     case dslash_test_type::MatPC:
-      staggeredMatDagMat(spinorRef, qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, spinor,
+      staggeredMatDagMat(spinorRef, qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor,
                          mass, 0, inv_param.cpu_prec, gauge_param.cpu_prec, tmpCpu, parity, dslash_type);
       break;
     case dslash_test_type::Mat:
       // the !dagger is to reconcile the QUDA convention of D_stag = {{ 2m, -D_{eo}}, -D_{oe}, 2m}} vs the host convention without the minus signs
-      staggeredDslash(spinorRef.Even(), qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu,
+      staggeredDslash(spinorRef.Even(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(),
                       spinor.Odd(), QUDA_EVEN_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
-      staggeredDslash(spinorRef.Odd(), qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu,
+      staggeredDslash(spinorRef.Odd(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(),
                       spinor.Even(), QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
       if (dslash_type == QUDA_LAPLACE_DSLASH) {
         xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
@@ -116,7 +114,6 @@ struct StaggeredDslashTestWrapper {
     setStaggeredInvertParam(inv_param);
 
     auto prec = getPrecision(precision);
-    setVerbosity(QUDA_SUMMARIZE);
 
     gauge_param.cuda_prec = prec;
     gauge_param.cuda_prec_sloppy = prec;
@@ -161,88 +158,53 @@ struct StaggeredDslashTestWrapper {
       Nsrc = 1;
     }
 
-    // Allocate a lot of memory because I'm very confused
-    void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-    void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-
-    milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-    milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-
-    void *qdp_fatlink_gpu[4];
-    void *qdp_longlink_gpu[4];
-
+    // Allocate fields
     for (int dir = 0; dir < 4; dir++) {
       qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
-      qdp_fatlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      qdp_longlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
-      qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_fatlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_longlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
     }
+    milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
-    bool gauge_loaded = false;
-    constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu,
-                                           qdp_fatlink_gpu, gauge_param, 0, nullptr, gauge_loaded);
-
-    // Alright, we've created all the void** links.
-    // Create the void* pointers
-    reorderQDPtoMILC(milc_fatlink_gpu, qdp_fatlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    // Create ghost zones for CPU fields,
-    // prepare and load the GPU fields
-
-#ifdef MULTI_GPU
-    gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS;
+    // For load, etc
     gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
-    GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink_cpu);
-    cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-    cpuFat = new cpuGaugeField(cpuFatParam);
-    ghost_fatlink_cpu = cpuFat->Ghost();
-
-    if (dslash_type == QUDA_ASQTAD_DSLASH) {
-      gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
-      GaugeFieldParam cpuLongParam(gauge_param, milc_longlink_cpu);
-      cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-      cpuLong = new cpuGaugeField(cpuLongParam);
-      ghost_longlink_cpu = cpuLong ? cpuLong->Ghost() : nullptr;
-    }
-#endif
-
-    gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS;
-    if (dslash_type == QUDA_STAGGERED_DSLASH) {
-      gauge_param.reconstruct = gauge_param.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ?
-                                             QUDA_RECONSTRUCT_13 :
-        (link_recon == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_9 :
-                                             link_recon;
-    } else {
-      gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
-    }
 
-    // set verbosity prior to loadGaugeQuda
-    setVerbosity(verbosity);
+    // Dummy arg needed because other tests load the gauge field more than once
+    bool gauge_loaded = false;
+    constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, gauge_loaded);
+    // Reorder gauge fields to MILC order
+    reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+
+    // Create ghost gauge fields in case of multi GPU builds.
+    gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ?
+      QUDA_SU3_LINKS :
+      QUDA_ASQTAD_FAT_LINKS;
+    gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
+    gauge_param.location = QUDA_CPU_FIELD_LOCATION;
 
-    printfQuda("Sending fat links to GPU\n");
-    loadGaugeQuda(milc_fatlink_gpu, &gauge_param);
+    GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink);
+    cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
+    cpuFat = GaugeField::Create(cpuFatParam);
 
     gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
-
-#ifdef MULTI_GPU
-    gauge_param.ga_pad *= 3;
-#endif
-
-    if (dslash_type == QUDA_ASQTAD_DSLASH) {
-      gauge_param.staggered_phase_type = QUDA_STAGGERED_PHASE_NO;
-      gauge_param.reconstruct = gauge_param.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ?
-                                             QUDA_RECONSTRUCT_13 :
-        (link_recon == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_9 :
-                                             link_recon;
-      printfQuda("Sending long links to GPU\n");
-      loadGaugeQuda(milc_longlink_gpu, &gauge_param);
+    GaugeFieldParam cpuLongParam(gauge_param, milc_longlink);
+    cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
+    cpuLong = GaugeField::Create(cpuLongParam);
+
+    // Override link reconstruct as appropriate for staggered or asqtad
+    if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) {
+      if (link_recon == QUDA_RECONSTRUCT_12) link_recon = QUDA_RECONSTRUCT_13;
+      if (link_recon == QUDA_RECONSTRUCT_8) link_recon = QUDA_RECONSTRUCT_9;
     }
 
+    loadFatLongGaugeQuda(milc_fatlink, milc_longlink, gauge_param);
+
+    // reset the reconstruct in gauge param
+    gauge_param.reconstruct = link_recon;
+
+    // Create an appropriate host ColorSpinorParam
     ColorSpinorParam csParam;
     csParam.nColor = 3;
     csParam.nSpin = 1;
@@ -251,7 +213,6 @@ struct StaggeredDslashTestWrapper {
     csParam.x[4] = 1;
 
     csParam.setPrecision(inv_param.cpu_prec);
-    // inv_param.solution_type = QUDA_MAT_SOLUTION;
     csParam.pad = 0;
     if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) {
       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
@@ -298,39 +259,40 @@ struct StaggeredDslashTestWrapper {
     DiracParam diracParam;
     setDiracParam(diracParam, &inv_param, pc);
     dirac = Dirac::create(diracParam);
-
-    for (int dir = 0; dir < 4; dir++) {
-      host_free(qdp_fatlink_gpu[dir]);
-      host_free(qdp_longlink_gpu[dir]);
-      host_free(qdp_inlink[dir]);
-    }
-    host_free(milc_fatlink_cpu);
-    host_free(milc_longlink_cpu);
   }
 
   void end()
   {
     for (int dir = 0; dir < 4; dir++) {
-      if (qdp_fatlink_cpu[dir] != nullptr) {
-        host_free(qdp_fatlink_cpu[dir]);
-        qdp_fatlink_cpu[dir] = nullptr;
+      if (qdp_inlink[dir] != nullptr) {
+        host_free(qdp_inlink[dir]);
+        qdp_inlink[dir] = nullptr;
+      }
+      if (qdp_fatlink[dir] != nullptr) {
+        host_free(qdp_fatlink[dir]);
+        qdp_fatlink[dir] = nullptr;
       }
-      if (qdp_longlink_cpu[dir] != nullptr) {
-        host_free(qdp_longlink_cpu[dir]);
-        qdp_longlink_cpu[dir] = nullptr;
+      if (qdp_longlink[dir] != nullptr) {
+        host_free(qdp_longlink[dir]);
+        qdp_longlink[dir] = nullptr;
       }
     }
 
+    if (milc_fatlink) {
+      host_free(milc_fatlink);
+      milc_fatlink = nullptr;
+    }
+
+    if (milc_longlink) {
+      host_free(milc_longlink);
+      milc_longlink = nullptr;
+    }
+
     if (dirac != nullptr) {
       delete dirac;
       dirac = nullptr;
     }
 
-    host_free(milc_fatlink_gpu);
-    milc_fatlink_gpu = nullptr;
-    host_free(milc_longlink_gpu);
-    milc_longlink_gpu = nullptr;
-
     freeGaugeQuda();
 
     if (cpuFat) {
@@ -362,7 +324,7 @@ struct StaggeredDslashTestWrapper {
         _hp_x[i] = vp_spinor_out[i].V();
         _hp_b[i] = vp_spinor[i].V();
       }
-      dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink_gpu, milc_longlink_gpu,
+      dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, qdp_fatlink, qdp_longlink,
                                   &gauge_param);
 
     } else {
@@ -371,11 +333,18 @@ struct StaggeredDslashTestWrapper {
 
         host_timer.start();
 
-        switch (dtest_type) {
-        case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break;
-        case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break;
-        case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break;
-        default: errorQuda("Test type %d not defined on staggered dslash", static_cast<int>(dtest_type));
+        if (dslash_type == QUDA_LAPLACE_DSLASH) {
+          switch (dtest_type) {
+          case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break;
+          default: errorQuda("Test type %d not defined on Laplace operator", static_cast<int>(dtest_type));
+          }
+        } else {
+          switch (dtest_type) {
+          case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break;
+          case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break;
+          case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break;
+          default: errorQuda("Test type %d not defined on staggered dslash", static_cast<int>(dtest_type));
+          }
         }
 
         host_timer.stop();
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 70877d36d2..911d58a2f9 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -132,7 +132,9 @@ int main(int argc, char **argv)
   milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
   milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
-  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
+  // Dummy arg needed because other tests load the gauge field more than once
+  bool gauge_loaded = false;
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, gauge_loaded);
 
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
   double plaq[3];
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 27d752f1b6..4e2481da84 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -234,7 +234,9 @@ int main(int argc, char **argv)
   // For load, etc
   gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
 
-  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
+  // Dummy arg needed because other tests load the gauge field more than once
+  bool gauge_loaded = false;
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, gauge_loaded);
   // Reorder gauge fields to MILC order
   reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
   reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 66c46fb5ea..1d2692b25e 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -51,7 +51,7 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli
                                             void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
                                             int argc, char **argv, bool &gauge_loaded);
 void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
-                                      QudaGaugeParam &gauge_param, int argc, char **argv);
+                                      QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded);
 void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *,
                                 QudaDslashType dslash_type);
 void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param);
diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp
index 2759e3489b..e9e18948d4 100644
--- a/tests/utils/staggered_gauge_utils.cpp
+++ b/tests/utils/staggered_gauge_utils.cpp
@@ -26,8 +26,12 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
                          void **qdp_inlink, QudaGaugeParam &gauge_param_in, double **act_path_coeffs, double eps_naik,
                          size_t gSize, int n_naiks)
 {
-  // since a lot of intermediaries can be general matrices, override the recon in `gauge_param_in`
+  // Intermediates can be general matrices, so override the reconstruct.
+  // Similarly, gauge links can only be built in single or double, so upscale the build precision
+  // if necessary.
   auto gauge_param = gauge_param_in;
+  if (gauge_param.cuda_prec < QUDA_SINGLE_PRECISION)
+    gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
   gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
   gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; // probably irrelevant
 
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index 021bbd6877..4694aa829a 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -75,16 +75,19 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli
 }
 
 void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
-                                      QudaGaugeParam &gauge_param, int argc, char **argv)
+                                      QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded)
 {
   gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
 
   if (latfile.size() > 0) {
-    // load in the command line supplied gauge field using QIO and LIME
-    read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
-    if (dslash_type != QUDA_LAPLACE_DSLASH) {
-      applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
-    }
+    if (!gauge_loaded) {
+      // load in the command line supplied gauge field using QIO and LIME
+      read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
+      if (dslash_type != QUDA_LAPLACE_DSLASH) {
+        applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
+      }
+      gauge_loaded = true;
+    } // else gauge already loaded
   } else {
     int construct_type = (unit_gauge) ? 0 : 1;
     if (dslash_type == QUDA_LAPLACE_DSLASH) {

From cc31020ebda149ad98aa596e2cd1cf9ffa0d28e4 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 29 Aug 2023 18:27:56 -0400
Subject: [PATCH 04/53] Various cleanup of gauge fields in staggered test exes

---
 tests/host_reference/dslash_reference.cpp     | 12 +--
 tests/host_reference/dslash_reference.h       |  3 +-
 .../staggered_dslash_reference.cpp            | 93 ++++++++-----------
 .../staggered_dslash_reference.h              | 19 ++--
 tests/staggered_dslash_test_utils.h           | 18 ++--
 tests/staggered_invert_test.cpp               | 12 +--
 6 files changed, 68 insertions(+), 89 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 9fc53fe6bf..63ed621c80 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -744,8 +744,7 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
 }
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[],
-                                void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param,
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param,
                                 QudaInvertParam &inv_param, int shift)
 {
   switch (test_type) {
@@ -757,10 +756,8 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
     // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not
     // have the minus sign. Passing in QUDA_DAG_YES solves this
     // discrepancy.
-    staggeredDslash(ref.Even(), qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out.Odd(), QUDA_EVEN_PARITY,
-                    QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
-    staggeredDslash(ref.Odd(), qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out.Even(), QUDA_ODD_PARITY,
-                    QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
+    staggeredDslash(ref.Even(), fat_link, long_link, out.Odd(), QUDA_EVEN_PARITY, QUDA_DAG_YES, dslash_type);
+    staggeredDslash(ref.Odd(), fat_link, long_link, out.Even(), QUDA_ODD_PARITY, QUDA_DAG_YES, dslash_type);
 
     if (dslash_type == QUDA_LAPLACE_DSLASH) {
       xpay(out.V(), kappa, ref.V(), ref.Length(), gauge_param.cpu_prec);
@@ -775,8 +772,7 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
   case 5: // multi mass CG, even parity solution, solving EVEN system
   case 6: // multi mass CG, odd parity solution, solving ODD system
 
-    staggeredMatDagMat(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec,
-                       gauge_param.cpu_prec, tmp,
+    staggeredMatDagMat(ref, fat_link, long_link, out, mass, 0, tmp,
                        (test_type == 3 || test_type == 5) ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type);
     break;
   }
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 44392628c2..42f90fed91 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -109,8 +109,7 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[],
-                                void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param,
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param,
                                 QudaInvertParam &inv_param, int shift);
 
 // i represents a "half index" into an even or odd "half lattice".
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 86ecd17464..24a2932078 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -11,6 +11,7 @@
 #include <command_line_params.h>
 #include "misc.h"
 #include <blas_quda.h>
+#include <gauge_field.h>
 
 #include <dslash_reference.h>
 
@@ -32,24 +33,24 @@ template <typename Float> void display_link_internal(Float *link)
 // if oddBit is one:  calculate odd parity spinor elements
 // if daggerBit is zero: perform ordinary dslash operator
 // if daggerBit is one:  perform hermitian conjugate of dslash
-template <typename sFloat, typename gFloat>
+template <typename real_t>
 #ifdef MULTI_GPU
-void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, gFloat **ghostFatlink,
-                              gFloat **ghostLonglink, sFloat *spinorField, sFloat **fwd_nbr_spinor,
-                              sFloat **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type)
+void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink,
+                              real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor,
+                              real_t **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type)
 #else
-void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, gFloat **, gFloat **, sFloat *spinorField,
-                              sFloat **, sFloat **, int oddBit, int daggerBit, QudaDslashType dslash_type)
+void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **, real_t **, real_t *spinorField,
+                              real_t **, real_t **, int oddBit, int daggerBit, QudaDslashType dslash_type)
 #endif
 {
   for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0;
 
-  gFloat *fatlinkEven[4], *fatlinkOdd[4];
-  gFloat *longlinkEven[4], *longlinkOdd[4];
+  real_t *fatlinkEven[4], *fatlinkOdd[4];
+  real_t *longlinkEven[4], *longlinkOdd[4];
 
 #ifdef MULTI_GPU
-  gFloat *ghostFatlinkEven[4], *ghostFatlinkOdd[4];
-  gFloat *ghostLonglinkEven[4], *ghostLonglinkOdd[4];
+  real_t *ghostFatlinkEven[4], *ghostFatlinkOdd[4];
+  real_t *ghostLonglinkEven[4], *ghostLonglinkOdd[4];
 #endif
 
   for (int dir = 0; dir < 4; dir++) {
@@ -72,28 +73,28 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
     for (int dir = 0; dir < 8; dir++) {
 #ifdef MULTI_GPU
       const int nFace = dslash_type == QUDA_ASQTAD_DSLASH ? 3 : 1;
-      gFloat *fatlnk
+      real_t *fatlnk
         = gaugeLink_mg4dir(sid, dir, oddBit, fatlinkEven, fatlinkOdd, ghostFatlinkEven, ghostFatlinkOdd, 1, 1);
-      gFloat *longlnk = dslash_type == QUDA_ASQTAD_DSLASH ?
+      real_t *longlnk = dslash_type == QUDA_ASQTAD_DSLASH ?
         gaugeLink_mg4dir(sid, dir, oddBit, longlinkEven, longlinkOdd, ghostLonglinkEven, ghostLonglinkOdd, 3, 3) :
         nullptr;
-      sFloat *first_neighbor_spinor = spinorNeighbor_5d_mgpu<QUDA_4D_PC>(
+      real_t *first_neighbor_spinor = spinorNeighbor_5d_mgpu<QUDA_4D_PC>(
         sid, dir, oddBit, spinorField, fwd_nbr_spinor, back_nbr_spinor, 1, nFace, stag_spinor_site_size);
-      sFloat *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ?
+      real_t *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ?
         spinorNeighbor_5d_mgpu<QUDA_4D_PC>(sid, dir, oddBit, spinorField, fwd_nbr_spinor, back_nbr_spinor, 3, nFace,
                                            stag_spinor_site_size) :
         nullptr;
 #else
-      gFloat *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, 1);
-      gFloat *longlnk
+      real_t *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, 1);
+      real_t *longlnk
         = dslash_type == QUDA_ASQTAD_DSLASH ? gaugeLink(sid, dir, oddBit, longlinkEven, longlinkOdd, 3) : nullptr;
-      sFloat *first_neighbor_spinor
+      real_t *first_neighbor_spinor
         = spinorNeighbor_5d<QUDA_4D_PC>(sid, dir, oddBit, spinorField, 1, stag_spinor_site_size);
-      sFloat *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ?
+      real_t *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ?
         spinorNeighbor_5d<QUDA_4D_PC>(sid, dir, oddBit, spinorField, 3, stag_spinor_site_size) :
         nullptr;
 #endif
-      sFloat gaugedSpinor[stag_spinor_site_size];
+      real_t gaugedSpinor[stag_spinor_site_size];
 
       if (dir % 2 == 0) {
         su3Mul(gaugedSpinor, fatlnk, first_neighbor_spinor);
@@ -122,10 +123,12 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
   } // 4-d volume
 }
 
-void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                     void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
-                     QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
+void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+                     const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type)
 {
+  // assert sPrecision and gPrecision must be the same
+  if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
+
   QudaParity otherparity = QUDA_INVALID_PARITY;
   if (oddBit == QUDA_EVEN_PARITY) {
     otherparity = QUDA_ODD_PARITY;
@@ -141,36 +144,24 @@ void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, voi
   void **fwd_nbr_spinor = in.fwdGhostFaceBuffer;
   void **back_nbr_spinor = in.backGhostFaceBuffer;
 
-  if (sPrecision == QUDA_DOUBLE_PRECISION) {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      staggeredDslashReference((double *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
-                               (double **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor,
-                               (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
-    } else {
-      staggeredDslashReference((double *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
-                               (float **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor,
-                               (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
-    }
-  } else {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      staggeredDslashReference((float *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
-                               (double **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor,
-                               (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
-    } else {
-      staggeredDslashReference((float *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
-                               (float **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor,
-                               (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
-    }
+  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
+    staggeredDslashReference((double *)out.V(), (double **)fat_link.Gauge_p(), (double **)long_link.Gauge_p(),
+                             (double **)fat_link.Ghost(), (double **)long_link.Ghost(),
+                             (double *)in.V(), (double **)fwd_nbr_spinor,
+                             (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
+  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
+    staggeredDslashReference((float *)out.V(), (float **)fat_link.Gauge_p(), (float **)long_link.Gauge_p(),
+                             (float **)fat_link.Ghost(), (float **)long_link.Ghost(),
+                             (float *)in.V(), (float **)fwd_nbr_spinor,
+                             (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
   }
 }
 
-void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                        void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
-                        QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity,
-                        QudaDslashType dslash_type)
+void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit,
+                        ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
-  if (sPrecision != gPrecision) { errorQuda("Spinor precision and gPrecison is not the same"); }
+  if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); }
 
   QudaParity otherparity = QUDA_INVALID_PARITY;
   if (parity == QUDA_EVEN_PARITY) {
@@ -181,14 +172,12 @@ void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink,
     errorQuda("full parity not supported in function");
   }
 
-  staggeredDslash(tmp, fatlink, longlink, ghost_fatlink, ghost_longlink, in, otherparity, dagger_bit, sPrecision,
-                  gPrecision, dslash_type);
+  staggeredDslash(tmp, fat_link, long_link, in, otherparity, dagger_bit, dslash_type);
 
-  staggeredDslash(out, fatlink, longlink, ghost_fatlink, ghost_longlink, tmp, parity, dagger_bit, sPrecision,
-                  gPrecision, dslash_type);
+  staggeredDslash(out, fat_link, long_link, tmp, parity, dagger_bit, dslash_type);
 
   double msq_x4 = mass * mass * 4;
-  if (sPrecision == QUDA_DOUBLE_PRECISION) {
+  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
     axmy((double *)in.V(), (double)msq_x4, (double *)out.V(), Vh * stag_spinor_site_size);
   } else {
     axmy((float *)in.V(), (float)msq_x4, (float *)out.V(), Vh * stag_spinor_site_size);
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index 54d40fdc0d..4a473c114d 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -11,16 +11,13 @@ using namespace quda;
 
 void setDims(int *);
 
-template <typename sFloat, typename gFloat>
-void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, gFloat **ghostFatlink,
-                              gFloat **ghostLonglink, sFloat *spinorField, sFloat **fwd_nbr_spinor,
-                              sFloat **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type);
+template <typename real_t>
+void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink,
+                              real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor,
+                              real_t **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type);
 
-void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                     void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
-                     QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type);
+void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, int oddBit, int daggerBit,
+                     QudaDslashType dslash_type);
 
-void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                        void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
-                        QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity,
-                        QudaDslashType dslash_type);
+void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit,
+                        ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type);
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 65cbd21dfa..73a1cac005 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -82,19 +82,15 @@ struct StaggeredDslashTestWrapper {
     printfQuda("Calculating reference implementation...");
     switch (dtest_type) {
     case dslash_test_type::Dslash:
-      staggeredDslash(spinorRef, qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor,
-                      parity, dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
+      staggeredDslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type);
       break;
     case dslash_test_type::MatPC:
-      staggeredMatDagMat(spinorRef, qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor,
-                         mass, 0, inv_param.cpu_prec, gauge_param.cpu_prec, tmpCpu, parity, dslash_type);
+      staggeredMatDagMat(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type);
       break;
     case dslash_test_type::Mat:
       // the !dagger is to reconcile the QUDA convention of D_stag = {{ 2m, -D_{eo}}, -D_{oe}, 2m}} vs the host convention without the minus signs
-      staggeredDslash(spinorRef.Even(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(),
-                      spinor.Odd(), QUDA_EVEN_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
-      staggeredDslash(spinorRef.Odd(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(),
-                      spinor.Even(), QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
+      staggeredDslash(spinorRef.Even(), *cpuFat, *cpuLong, spinor.Odd(), QUDA_EVEN_PARITY, !dagger, dslash_type);
+      staggeredDslash(spinorRef.Odd(), *cpuFat, *cpuLong, spinor.Even(), QUDA_ODD_PARITY, !dagger, dslash_type);
       if (dslash_type == QUDA_LAPLACE_DSLASH) {
         xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
       } else {
@@ -184,12 +180,14 @@ struct StaggeredDslashTestWrapper {
     gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
     gauge_param.location = QUDA_CPU_FIELD_LOCATION;
 
-    GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink);
+    GaugeFieldParam cpuFatParam(gauge_param, qdp_fatlink);
+    cpuFatParam.order = QUDA_QDP_GAUGE_ORDER;
     cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
     cpuFat = GaugeField::Create(cpuFatParam);
 
     gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
-    GaugeFieldParam cpuLongParam(gauge_param, milc_longlink);
+    GaugeFieldParam cpuLongParam(gauge_param, qdp_longlink);
+    cpuLongParam.order = QUDA_QDP_GAUGE_ORDER;
     cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
     cpuLong = GaugeField::Create(cpuLongParam);
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 79d099afb3..210a10c176 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -261,12 +261,14 @@ int main(int argc, char **argv)
   gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
   gauge_param.location = QUDA_CPU_FIELD_LOCATION;
 
-  GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink);
+  GaugeFieldParam cpuFatParam(gauge_param, qdp_fatlink);
+  cpuFatParam.order = QUDA_QDP_GAUGE_ORDER;
   cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuFat = GaugeField::Create(cpuFatParam);
 
   gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
-  GaugeFieldParam cpuLongParam(gauge_param, milc_longlink);
+  GaugeFieldParam cpuLongParam(gauge_param, qdp_longlink);
+  cpuLongParam.order = QUDA_QDP_GAUGE_ORDER;
   cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuLong = GaugeField::Create(cpuLongParam);
 
@@ -361,8 +363,7 @@ int main(int argc, char **argv)
 
     for (int k = 0; k < Nsrc; k++) {
       if (verify_results)
-        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, qdp_fatlink, qdp_longlink, (void **)cpuFat->Ghost(),
-                                 (void **)cpuLong->Ghost(), gauge_param, inv_param, 0);
+        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0);
     }
   } else if (test_type == 5 || test_type == 6) {
     // case 5: // multi mass CG, even parity solution, solving EVEN system
@@ -417,8 +418,7 @@ int main(int argc, char **argv)
 
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], qdp_fatlink, qdp_longlink,
-                                 (void **)cpuFat->Ghost(), (void **)cpuLong->Ghost(), gauge_param, inv_param, i);
+        verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i);
       }
     }
   } else {

From 104c404366bbebbbc3ea0e483663aecdcc8edd0a Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 30 Aug 2023 12:28:01 -0400
Subject: [PATCH 05/53] Various bits of function cleanup, making host verify
 names more consistent with Wilson-type verifies

---
 tests/host_reference/dslash_reference.cpp     | 23 ++++-------
 .../staggered_dslash_reference.cpp            | 39 +++++++++++++++----
 .../staggered_dslash_reference.h              | 11 ++++--
 tests/staggered_dslash_test_utils.h           | 13 ++-----
 4 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 63ed621c80..7841d63d4a 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -747,24 +747,17 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
                                 quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param,
                                 QudaInvertParam &inv_param, int shift)
 {
+  int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
+
   switch (test_type) {
   case 0: // full parity solution, full parity system
   case 1: // full parity solution, solving EVEN EVEN prec system
   case 2: // full parity solution, solving ODD ODD prec system
+    stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
 
-    // In QUDA, the full staggered operator has the sign convention
-    // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not
-    // have the minus sign. Passing in QUDA_DAG_YES solves this
-    // discrepancy.
-    staggeredDslash(ref.Even(), fat_link, long_link, out.Odd(), QUDA_EVEN_PARITY, QUDA_DAG_YES, dslash_type);
-    staggeredDslash(ref.Odd(), fat_link, long_link, out.Even(), QUDA_ODD_PARITY, QUDA_DAG_YES, dslash_type);
-
-    if (dslash_type == QUDA_LAPLACE_DSLASH) {
-      xpay(out.V(), kappa, ref.V(), ref.Length(), gauge_param.cpu_prec);
-      ax(0.5 / kappa, ref.V(), ref.Length(), gauge_param.cpu_prec);
-    } else {
-      axpy(2 * mass, out.V(), ref.V(), ref.Length(), gauge_param.cpu_prec);
-    }
+    // exact reason for this tbd, this isn't needed in the dslash test...
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      ax(0.5 / kappa, ref.V(), ref.Length(), ref.Precision());
     break;
 
   case 3: // even parity solution, solving EVEN system
@@ -772,8 +765,8 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
   case 5: // multi mass CG, even parity solution, solving EVEN system
   case 6: // multi mass CG, odd parity solution, solving ODD system
 
-    staggeredMatDagMat(ref, fat_link, long_link, out, mass, 0, tmp,
-                       (test_type == 3 || test_type == 5) ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type);
+    stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp,
+               (test_type == 3 || test_type == 5) ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type);
     break;
   }
 
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 24a2932078..c263fa93ef 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -123,8 +123,8 @@ void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink,
   } // 4-d volume
 }
 
-void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
-                     const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type)
+void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+                 const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
   if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
@@ -157,8 +157,33 @@ void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const Ga
   }
 }
 
-void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit,
-                        ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type)
+void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+              const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type)
+{
+  // assert sPrecision and gPrecision must be the same
+  if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
+
+  // assert we have full-parity spinors
+  if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET)
+    errorQuda("Unexpected site subsets for stag_mat, out %d in %d", out.SiteSubset(), in.SiteSubset());
+
+  // In QUDA, the full staggered operator has the sign convention
+  // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not
+  // have the minus sign. Inverting the expected dagger convention
+  // solves this discrepancy.
+  stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type);
+  stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type);
+
+  if (dslash_type == QUDA_LAPLACE_DSLASH) {
+    double kappa = 1.0 / (8 + mass);
+    xpay((void*)in.V(), kappa, out.V(), out.Length(), out.Precision());
+  } else {
+    axpy(2 * mass, (void*)in.V(), out.V(), out.Length(), out.Precision());
+  }
+}
+
+void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int,
+                ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
   if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); }
@@ -172,9 +197,9 @@ void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const
     errorQuda("full parity not supported in function");
   }
 
-  staggeredDslash(tmp, fat_link, long_link, in, otherparity, dagger_bit, dslash_type);
-
-  staggeredDslash(out, fat_link, long_link, tmp, parity, dagger_bit, dslash_type);
+  // dagger bit does not matter
+  stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type);
+  stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type);
 
   double msq_x4 = mass * mass * 4;
   if (in.Precision() == QUDA_DOUBLE_PRECISION) {
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index 4a473c114d..9fc6c9d641 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -16,8 +16,11 @@ void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink,
                               real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor,
                               real_t **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type);
 
-void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, int oddBit, int daggerBit,
-                     QudaDslashType dslash_type);
+void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
+                 int oddBit, int daggerBit, QudaDslashType dslash_type);
 
-void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit,
-                        ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type);
+void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
+              double mass, int daggerBit, QudaDslashType dslash_type);
+
+void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
+                double mass, int dagger_bit, ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type);
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 73a1cac005..b59d21d6b6 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -82,20 +82,13 @@ struct StaggeredDslashTestWrapper {
     printfQuda("Calculating reference implementation...");
     switch (dtest_type) {
     case dslash_test_type::Dslash:
-      staggeredDslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type);
+      stag_dslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type);
       break;
     case dslash_test_type::MatPC:
-      staggeredMatDagMat(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type);
+      stag_matpc(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type);
       break;
     case dslash_test_type::Mat:
-      // the !dagger is to reconcile the QUDA convention of D_stag = {{ 2m, -D_{eo}}, -D_{oe}, 2m}} vs the host convention without the minus signs
-      staggeredDslash(spinorRef.Even(), *cpuFat, *cpuLong, spinor.Odd(), QUDA_EVEN_PARITY, !dagger, dslash_type);
-      staggeredDslash(spinorRef.Odd(), *cpuFat, *cpuLong, spinor.Even(), QUDA_ODD_PARITY, !dagger, dslash_type);
-      if (dslash_type == QUDA_LAPLACE_DSLASH) {
-        xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
-      } else {
-        axpy(2 * mass, spinor.V(), spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
-      }
+      stag_mat(spinorRef, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type);
       break;
     default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type));
     }

From 1f8f89c2d0e4e52cb5d077f4710caa399e71b023 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 31 Aug 2023 13:25:33 -0400
Subject: [PATCH 06/53] Added support for mdagm tests for staggered, asqtad

---
 tests/staggered_dslash_ctest.cpp    | 11 ++++++-----
 tests/staggered_dslash_test_utils.h | 15 +++++++++++----
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index ceffb74bb8..6945e4ab13 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -20,14 +20,11 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
     QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
 
     if ((QUDA_PRECISION & getPrecision(::testing::get<0>(GetParam()))) == 0
-        || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) {
+        || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0)
       return true;
-    }
 
-    if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) {
-      warningQuda("Fixed precision unsupported for Laplace operator, skipping...");
+    if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1))
       return true;
-    }
 
     const std::array<bool, 16> partition_enabled {true, true, true,  false,  true,  false, false, false,
                                                   true, false, false, false, true, false, true, true};
@@ -102,6 +99,10 @@ int main(int argc, char **argv)
 {
   // initalize google test
   ::testing::InitGoogleTest(&argc, argv);
+
+  // override the default dslash from Wilson
+  dslash_type = QUDA_ASQTAD_DSLASH;
+
   auto app = make_app();
   app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map));
   app->add_option("--all-partitions", ctest_all_partitions, "Test all instead of reduced combination of partitions");
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index b59d21d6b6..5ee65c23f1 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -24,10 +24,12 @@ using namespace quda;
 
 dslash_test_type dtest_type = dslash_test_type::Dslash;
 CLI::TransformPairs<dslash_test_type> dtest_type_map {
-  {"Dslash", dslash_test_type::Dslash}, {"MatPC", dslash_test_type::MatPC}, {"Mat", dslash_test_type::Mat}
-  // left here for completeness but not support in staggered dslash test
+  {"Dslash", dslash_test_type::Dslash},
+  {"MatPC", dslash_test_type::MatPC},
+  {"Mat", dslash_test_type::Mat},
+  {"MatDagMat", dslash_test_type::MatDagMat},
+  // left here for completeness but not supported in staggered dslash test
   // {"MatPCDagMatPC", dslash_test_type::MatPCDagMatPC},
-  // {"MatDagMat", dslash_test_type::MatDagMat},
   // {"M5", dslash_test_type::M5},
   // {"M5inv", dslash_test_type::M5inv},
   // {"Dslash4pre", dslash_test_type::Dslash4pre}
@@ -90,6 +92,10 @@ struct StaggeredDslashTestWrapper {
     case dslash_test_type::Mat:
       stag_mat(spinorRef, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type);
       break;
+    case dslash_test_type::MatDagMat:
+      stag_mat(tmpCpu, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type);
+      stag_mat(spinorRef, *cpuFat, *cpuLong, tmpCpu, mass, 1 - dagger, dslash_type);
+      break;
     default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type));
     }
   }
@@ -205,7 +211,7 @@ struct StaggeredDslashTestWrapper {
 
     csParam.setPrecision(inv_param.cpu_prec);
     csParam.pad = 0;
-    if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) {
+    if (dtest_type != dslash_test_type::Mat && dtest_type != dslash_test_type::MatDagMat) {
       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
       csParam.x[0] /= 2;
       inv_param.solution_type = QUDA_MATPC_SOLUTION;
@@ -334,6 +340,7 @@ struct StaggeredDslashTestWrapper {
           case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break;
           case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break;
           case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break;
+          case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinor); break;
           default: errorQuda("Test type %d not defined on staggered dslash", static_cast<int>(dtest_type));
           }
         }

From 94a332f1774f52055adb280d8fc488326462ff61 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 12 Sep 2023 11:19:12 -0400
Subject: [PATCH 07/53] Small cleanup of treatment of naik terms

---
 tests/staggered_dslash_ctest.cpp | 21 ++++++--------------
 tests/staggered_dslash_test.cpp  | 20 ++++++-------------
 tests/utils/set_params.cpp       | 34 ++++++++++----------------------
 3 files changed, 22 insertions(+), 53 deletions(-)

diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index 6945e4ab13..669482d96a 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -125,25 +125,16 @@ int main(int argc, char **argv)
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't
   // ask to build the fat/long links... it doesn't make sense.
-  if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) {
+  if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH)
     errorQuda(
       "Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\".\n");
-    compute_fatlong = true;
-  }
 
   // Set n_naiks to 2 if eps_naik != 0.0
-  if (dslash_type == QUDA_ASQTAD_DSLASH) {
-    if (eps_naik != 0.0) {
-      if (compute_fatlong) {
-        n_naiks = 2;
-        printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n");
-      } else {
-        eps_naik = 0.0;
-        printfQuda("Not computing fat-long, ignoring epsilon correction.\n");
-      }
-    } else {
-      printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n");
-    }
+  if (eps_naik != 0.0) {
+    if (compute_fatlong)
+      n_naiks = 2;
+    else
+      eps_naik = 0.0; // to avoid potential headaches
   }
 
   if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat)
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index 7c3524dacf..e24c8092c9 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -88,24 +88,16 @@ int main(int argc, char **argv)
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash,
   // and don't ask to build the fat/long links... it doesn't make sense.
-  if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) {
+  if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH)
     errorQuda(
       "Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\".");
-  }
 
   // Set n_naiks to 2 if eps_naik != 0.0
-  if (dslash_type == QUDA_ASQTAD_DSLASH) {
-    if (eps_naik != 0.0) {
-      if (compute_fatlong) {
-        n_naiks = 2;
-        printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n");
-      } else {
-        eps_naik = 0.0;
-        printfQuda("Not computing fat-long, ignoring epsilon correction.\n");
-      }
-    } else {
-      printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n");
-    }
+  if (eps_naik != 0.0) {
+    if (compute_fatlong)
+      n_naiks = 2;
+    else
+      eps_naik = 0.0; // to avoid potential headaches
   }
 
   if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat)
diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp
index 62cd9e8c90..aa95dbbd34 100644
--- a/tests/utils/set_params.cpp
+++ b/tests/utils/set_params.cpp
@@ -1429,18 +1429,11 @@ void setQudaStaggeredInvTestParams()
   }
 
   // Set n_naiks to 2 if eps_naik != 0.0
-  if (dslash_type == QUDA_ASQTAD_DSLASH) {
-    if (eps_naik != 0.0) {
-      if (compute_fatlong) {
-        n_naiks = 2;
-        printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n");
-      } else {
-        eps_naik = 0.0;
-        printfQuda("Not computing fat-long, ignoring epsilon correction.\n");
-      }
-    } else {
-      printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n");
-    }
+  if (eps_naik != 0.0) {
+    if (compute_fatlong)
+      n_naiks = 2;
+    else
+      eps_naik = 0.0; // to avoid potential headaches
   }
 }
 
@@ -1474,17 +1467,10 @@ void setQudaStaggeredEigTestParams()
   }
 
   // Set n_naiks to 2 if eps_naik != 0.0
-  if (dslash_type == QUDA_ASQTAD_DSLASH) {
-    if (eps_naik != 0.0) {
-      if (compute_fatlong) {
-        n_naiks = 2;
-        printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n");
-      } else {
-        eps_naik = 0.0;
-        printfQuda("Not computing fat-long, ignoring epsilon correction.\n");
-      }
-    } else {
-      printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n");
-    }
+  if (eps_naik != 0.0) {
+    if (compute_fatlong)
+      n_naiks = 2;
+    else
+      eps_naik = 0.0; // to avoid potential headaches
   }
 }

From dd67aa0a932bd8be45d50d93893d3142b04ed293 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 12 Sep 2023 12:47:31 -0400
Subject: [PATCH 08/53] Massive cleanup for staggered_invert/eigensolve_test,
 removed all enumerated test types, set new defaults, improved command line
 arg documentation

---
 tests/host_reference/dslash_reference.cpp |  30 ++---
 tests/host_reference/dslash_reference.h   |   2 +-
 tests/staggered_eigensolve_test.cpp       |  36 ++----
 tests/staggered_invert_test.cpp           |  58 ++-------
 tests/utils/command_line_params.cpp       |  12 +-
 tests/utils/host_utils.h                  |   3 +-
 tests/utils/misc.cpp                      |  17 ---
 tests/utils/misc.h                        |   1 -
 tests/utils/set_params.cpp                | 142 ++++++++--------------
 9 files changed, 100 insertions(+), 201 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 7841d63d4a..2afc145395 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -744,30 +744,30 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
 }
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param,
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link,
                                 QudaInvertParam &inv_param, int shift)
 {
   int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
 
-  switch (test_type) {
-  case 0: // full parity solution, full parity system
-  case 1: // full parity solution, solving EVEN EVEN prec system
-  case 2: // full parity solution, solving ODD ODD prec system
+  if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
     stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
 
     // exact reason for this tbd, this isn't needed in the dslash test...
     if (dslash_type == QUDA_LAPLACE_DSLASH)
       ax(0.5 / kappa, ref.V(), ref.Length(), ref.Precision());
-    break;
-
-  case 3: // even parity solution, solving EVEN system
-  case 4: // odd parity solution, solving ODD system
-  case 5: // multi mass CG, even parity solution, solving EVEN system
-  case 6: // multi mass CG, odd parity solution, solving ODD system
-
-    stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp,
-               (test_type == 3 || test_type == 5) ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type);
-    break;
+  } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) {
+    QudaParity parity = QUDA_INVALID_PARITY;
+    switch (inv_param.matpc_type) {
+      case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
+      case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
+      default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
+    }
+    stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type);
+  } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
+    stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type);
+    stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
+  } else {
+    errorQuda("Invalid staggered solution type %d", inv_param.solution_type);
   }
 
   int len = 0;
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 42f90fed91..32632ddf20 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -109,7 +109,7 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param,
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link,
                                 QudaInvertParam &inv_param, int shift);
 
 // i represents a "half index" into an even or odd "half lattice".
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 911d58a2f9..d5b411e0cf 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -20,10 +20,10 @@
 void display_test_info()
 {
   printfQuda("running the following test:\n");
-  printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon test_type  S_dimension T_dimension\n");
-  printfQuda("%s   %s             %s            %s            %s         %d/%d/%d          %d \n", get_prec_str(prec),
+  printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon S_dimension T_dimension\n");
+  printfQuda("%s   %s             %s            %s         %d/%d/%d          %d \n", get_prec_str(prec),
              get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy),
-             get_staggered_test_type(test_type), xdim, ydim, zdim, tdim);
+             xdim, ydim, zdim, tdim);
 
   printfQuda("\n   Eigensolver parameters\n");
   printfQuda(" - solver mode %s\n", get_eig_type_str(eig_type));
@@ -60,13 +60,11 @@ void display_test_info()
 
 int main(int argc, char **argv)
 {
-  // Set a default
-  solve_type = QUDA_INVALID_SOLVE;
+  // Set defaults
+  setQudaStaggeredDefaultInvTestParams();
 
   auto app = make_app();
   add_eigen_option_group(app);
-  CLI::TransformPairs<int> test_type_map {{"full", 0}, {"even", 3}, {"odd", 4}};
-  app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map));
 
   try {
     app->parse(argc, argv);
@@ -80,15 +78,9 @@ int main(int argc, char **argv)
   // Set values for precisions via the command line.
   setQudaPrecisions();
 
-  // Only these fermions are supported in this file. Ensure a reasonable default,
-  // ensure that the default is improved staggered
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
-    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
-               get_dslash_str(QUDA_ASQTAD_DSLASH));
-    dslash_type = QUDA_ASQTAD_DSLASH;
-  }
-
-  setQudaStaggeredEigTestParams();
+  // Only these fermions are supported in this file
+  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
 
   display_test_info();
 
@@ -167,10 +159,9 @@ int main(int argc, char **argv)
 
   // QUDA eigensolver test
   //----------------------------------------------------------------------------
-  switch (test_type) {
-  case 0: // full parity solution
-  case 3: // even
-  case 4: // odd
+  if ((solve_type == QUDA_DIRECT_SOLVE && solution_type == QUDA_MAT_SOLUTION) ||
+    (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type == QUDA_MATPC_SOLUTION) ||
+    (solve_type == QUDA_NORMOP_SOLVE && solution_type == QUDA_MATDAG_MAT_SOLUTION)) {
     // This function returns the host_evecs and host_evals pointers, populated with
     // the requested data, at the requested prec. All the information needed to
     // perfom the solve is in the eig_param container.
@@ -182,9 +173,8 @@ int main(int argc, char **argv)
     time += (double)clock();
 
     printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", time / CLOCKS_PER_SEC);
-    break;
-
-  default: errorQuda("Unsupported test type");
+  } else {
+    errorQuda("Unsupported combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type));
 
   } // switch
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 210a10c176..6cec447810 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -104,6 +104,7 @@ void display_test_info()
 
 int main(int argc, char **argv)
 {
+  setQudaStaggeredDefaultInvTestParams();
   setQudaDefaultMgTestParams();
   // Parse command line options
   auto app = make_app();
@@ -111,16 +112,13 @@ int main(int argc, char **argv)
   add_deflation_option_group(app);
   add_multigrid_option_group(app);
   add_comms_option_group(app);
-  CLI::TransformPairs<int> test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3},
-                                          {"odd", 4},  {"mcg_even", 5},     {"mcg_odd", 6}};
-  app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map));
+
   try {
     app->parse(argc, argv);
   } catch (const CLI::ParseError &e) {
     return app->exit(e);
   }
   setVerbosity(verbosity);
-  if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE;
 
   if (inv_deflate && inv_multigrid) {
     printfQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n");
@@ -135,13 +133,9 @@ int main(int argc, char **argv)
 
   initRand();
 
-  // Only these fermions are supported in this file. Ensure a reasonable default,
-  // ensure that the default is improved staggered
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
-    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
-               get_dslash_str(QUDA_ASQTAD_DSLASH));
-    dslash_type = QUDA_ASQTAD_DSLASH;
-  }
+  // Only these fermions are supported in this file
+  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
 
   // Need to add support for LAPLACE MG?
   if (inv_multigrid) {
@@ -151,9 +145,6 @@ int main(int argc, char **argv)
     }
   }
 
-  // Deduce operator, solution, and operator preconditioning types
-  if (!inv_multigrid) setQudaStaggeredInvTestParams();
-
   display_test_info();
 
   // Set QUDA internal parameters
@@ -310,27 +301,13 @@ int main(int argc, char **argv)
   std::vector<double> gflops(Nsrc);
   std::vector<int> iter(Nsrc);
 
-  // Pointers for split grid tests
-  std::vector<quda::ColorSpinorField *> _h_b(Nsrc, nullptr);
-  std::vector<quda::ColorSpinorField *> _h_x(Nsrc, nullptr);
+  // Populate `in` with random noise
+  for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); }
 
   // QUDA invert test
   //----------------------------------------------------------------------------
 
-  if (test_type >= 0 && test_type <= 4) {
-    // case 0: // full parity solution, full parity system
-    // case 1: // full parity solution, solving EVEN EVEN prec system
-    // case 2: // full parity solution, solving ODD ODD prec system
-    // case 3: // even parity solution, solving EVEN system
-    // case 4: // odd parity solution, solving ODD system
-
-    if (multishift != 1) {
-      printfQuda("Multishift not supported for test %d\n", test_type);
-      exit(0);
-    }
-
-    for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); }
-
+  if (multishift == 1) {
     if (!use_split_grid) {
       for (int k = 0; k < Nsrc; k++) {
         if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
@@ -363,18 +340,12 @@ int main(int argc, char **argv)
 
     for (int k = 0; k < Nsrc; k++) {
       if (verify_results)
-        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0);
+        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, inv_param, 0);
     }
-  } else if (test_type == 5 || test_type == 6) {
-    // case 5: // multi mass CG, even parity solution, solving EVEN system
-    // case 6: // multi mass CG, odd parity solution, solving ODD system
-
+  } else if (multishift > 1) {
     if (use_split_grid)
       errorQuda("Multishift currently doesn't support split grid.\n");
 
-    if (multishift < 2)
-      errorQuda("Multishift inverter requires more than one shift, multishift = %d\n", multishift);
-
     inv_param.num_offset = multishift;
 
     // Prepare vectors for masses
@@ -418,11 +389,11 @@ int main(int argc, char **argv)
 
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i);
+        verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], *cpuFat, *cpuLong, inv_param, i);
       }
     }
   } else {
-    errorQuda("Unsupported test type");
+    errorQuda("Invalid number of shifts %d", multishift);
   } // switch
 
   // Compute timings
@@ -457,11 +428,6 @@ int main(int argc, char **argv)
   delete ref;
   delete tmp;
 
-  if (use_split_grid) {
-    for (auto p : _h_b) { delete p; }
-    for (auto p : _h_x) { delete p; }
-  }
-
   // Finalize the QUDA library
   endQuda();
 
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index c56ec2bd14..99e6fb4cbc 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -471,7 +471,7 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
   quda_app->add_option("--device", device_ordinal, "Set the CUDA device to use (default 0, single GPU only)")
     ->check(CLI::Range(0, 16));
 
-  quda_app->add_option("--dslash-type", dslash_type, "Set the dslash type")
+  quda_app->add_option("--dslash-type", dslash_type, "Set the dslash type (default wilson or asqtad as appropriate)")
     ->transform(CLI::QUDACheckedTransformer(dslash_type_map));
 
   quda_app->add_option("--epsilon", epsilon, "Twisted-Mass flavor twist of Dirac operator (default 0.01)");
@@ -499,7 +499,7 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
     ->transform(CLI::QUDACheckedTransformer(mass_normalization_map));
 
   quda_app
-    ->add_option("--matpc", matpc_type, "Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym)")
+    ->add_option("--matpc", matpc_type, "Matrix preconditioning type (even-even (default), odd-odd, even-even-asym, odd-odd-asym)")
     ->transform(CLI::QUDACheckedTransformer(matpc_type_map));
   quda_app->add_option("--msrc", Msrc,
                        "Used for testing non-square block blas routines where nsrc defines the other dimension");
@@ -600,7 +600,7 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
   quda_app
     ->add_option(
       "--solution-type", solution_type,
-      "The solution we desire (mat (default), mat-dag-mat, mat-pc, mat-pc-dag-mat-pc (default for multi-shift))")
+      "The solution we desire (mat (default for Wilson-type), mat-dag-mat, mat-pc (default for staggered-type), mat-pc-dag-mat-pc (default for Wilson-type multi-shift))")
     ->transform(CLI::QUDACheckedTransformer(solution_type_map));
 
   quda_app
@@ -610,7 +610,7 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
 
   quda_app
     ->add_option("--solve-type", solve_type,
-                 "The type of solve to do (direct, direct-pc, normop, normop-pc, normerr, normerr-pc)")
+                 "The type of solve to do (direct, direct-pc (default for staggered-type), normop, normop-pc (default for Wilson-type), normerr, normerr-pc)")
     ->transform(CLI::QUDACheckedTransformer(solve_type_map));
   quda_app
     ->add_option("--solver-ext-lib-type", solver_ext_lib, "Set external library for the solvers  (default Eigen library)")
@@ -752,8 +752,8 @@ void add_eigen_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup->add_option("--eig-use-dagger", eig_use_dagger,
                       "Solve the Mdag problem instead of M (MMdag if eig-use-normop == true) (default false)");
   opgroup->add_option("--eig-use-normop", eig_use_normop,
-                      "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false)");
-  opgroup->add_option("--eig-use-pc", eig_use_pc, "Solve the Even-Odd preconditioned problem (default false)");
+                      "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false for Wilson-type, true for staggered-type)");
+  opgroup->add_option("--eig-use-pc", eig_use_pc, "Solve the Even-Odd preconditioned problem (default false for Wilson-type, true for staggered-type)");
   opgroup->add_option("--eig-use-poly-acc", eig_use_poly_acc, "Use Chebyshev polynomial acceleration in the eigensolver");
 }
 
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 1d2692b25e..23aa99e6df 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -42,8 +42,7 @@ extern QudaPrecision &cuda_prec_ritz;
 
 // Set some basic parameters via command line or use defaults
 // Implemented in set_params.cpp
-void setQudaStaggeredEigTestParams();
-void setQudaStaggeredInvTestParams();
+void setQudaStaggeredDefaultInvTestParams();
 
 // Staggered gauge field utils
 //------------------------------------------------------
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index c07fdb9e5d..f6e7a6a394 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -96,23 +96,6 @@ const char *get_test_type(int t)
   return ret;
 }
 
-const char *get_staggered_test_type(int t)
-{
-  const char *ret;
-  switch (t) {
-  case 0: ret = "full"; break;
-  case 1: ret = "full_ee_prec"; break;
-  case 2: ret = "full_oo_prec"; break;
-  case 3: ret = "even"; break;
-  case 4: ret = "odd"; break;
-  case 5: ret = "mcg_even"; break;
-  case 6: ret = "mcg_odd"; break;
-  default: ret = "unknown"; break;
-  }
-
-  return ret;
-}
-
 const char *get_dslash_str(QudaDslashType type)
 {
   const char *ret;
diff --git a/tests/utils/misc.h b/tests/utils/misc.h
index bac9cf69c9..bf9a8d3039 100644
--- a/tests/utils/misc.h
+++ b/tests/utils/misc.h
@@ -7,7 +7,6 @@ const char *get_recon_str(QudaReconstructType recon);
 const char *get_prec_str(QudaPrecision prec);
 const char *get_gauge_order_str(QudaGaugeFieldOrder order);
 const char *get_test_type(int t);
-const char *get_staggered_test_type(int t);
 const char *get_unitarization_str(bool svd_only);
 const char *get_mass_normalization_str(QudaMassNormalization);
 const char *get_verbosity_str(QudaVerbosity);
diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp
index aa95dbbd34..404401c2d6 100644
--- a/tests/utils/set_params.cpp
+++ b/tests/utils/set_params.cpp
@@ -1379,98 +1379,60 @@ void setDeflationParam(QudaEigParam &df_param)
   df_param.partfile = eig_partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
 }
 
-void setQudaStaggeredInvTestParams()
+/**********/
+// The enumerated staggered tests have been removed, but for reference:
+//
+// Test 0:
+//   solve_type = QUDA_DIRECT_SOLVE
+//   matpc_type = QUDA_MATPC_EVEN_EVEN (doesn't matter)
+//   solution_type = QUDA_MAT_SOLUTION
+//
+// Test 1:
+//   solve_type = QUDA_DIRECT_PC_SOLVE
+//   matpc_type = QUDA_MATPC_EVEN_EVEN
+//   solution_type = QUDA_MAT_SOLUTION
+//
+// Test 2:
+//   solve_type = QUDA_DIRECT_PC_SOLVE
+//   matpc_type = QUDA_MATPC_ODD_ODD
+//   solution_type = QUDA_MAT_SOLUTION
+//
+// Test 3:
+//   solve_type = QUDA_DIRECT_PC_SOLVE
+//   matpc_type = QUDA_MATPC_EVEN_EVEN
+//   solution_type = QUDA_MATPC_SOLUTION
+//
+// Test 4:
+//   solve_type = QUDA_DIRECT_PC_SOLVE
+//   matpc_type = QUDA_MATPC_ODD_ODD
+//   solution_type = QUDA_MATPC_SOLUTION
+//
+// Test 5: multi-shift
+//   solve_type = QUDA_DIRECT_PC_SOLVE
+//   matpc_type = QUDA_MATPC_EVEN_EVEN
+//   solution_type = QUDA_MATPC_SOLUTION
+//
+// Test 6: multi-shift
+//   solve_type = QUDA_DIRECT_PC_SOLVE
+//   matpc_type = QUDA_MATPC_ODD_ODD
+//   solution_type = QUDA_MATPC_SOLUTION
+/**********/
+
+void setQudaStaggeredDefaultInvTestParams()
 {
-  if (dslash_type == QUDA_LAPLACE_DSLASH) {
-    if (test_type != 0) { errorQuda("Test type %d is not supported for the Laplace operator.\n", test_type); }
+  // Set some meaningful defaults for staggered tests
 
-    solve_type = QUDA_DIRECT_SOLVE;
-    solution_type = QUDA_MAT_SOLUTION;
-    matpc_type = QUDA_MATPC_EVEN_EVEN; // doesn't matter
+  // Default to the ASQTAD dslash
+  dslash_type = QUDA_ASQTAD_DSLASH;
 
-  } else {
-
-    if (test_type == 0 && (inv_type == QUDA_CG_INVERTER || inv_type == QUDA_PCG_INVERTER)
-        && solve_type != QUDA_NORMOP_SOLVE && solve_type != QUDA_DIRECT_PC_SOLVE) {
-      warningQuda("The full spinor staggered operator (test 0) can't be inverted with (P)CG. Switching to BiCGstab.\n");
-      inv_type = QUDA_BICGSTAB_INVERTER;
-    }
-
-    if (solve_type == QUDA_INVALID_SOLVE) {
-      if (test_type == 0) {
-        solve_type = QUDA_DIRECT_SOLVE;
-      } else {
-        solve_type = QUDA_DIRECT_PC_SOLVE;
-      }
-    }
-
-    if (test_type == 1 || test_type == 3 || test_type == 5) {
-      matpc_type = QUDA_MATPC_EVEN_EVEN;
-    } else if (test_type == 2 || test_type == 4 || test_type == 6) {
-      matpc_type = QUDA_MATPC_ODD_ODD;
-    } else if (test_type == 0) {
-      matpc_type = QUDA_MATPC_EVEN_EVEN; // it doesn't matter
-    }
-
-    if (test_type == 0 || test_type == 1 || test_type == 2) {
-      solution_type = QUDA_MAT_SOLUTION;
-    } else {
-      solution_type = QUDA_MATPC_SOLUTION;
-    }
-  }
-
-  if (prec_sloppy == QUDA_INVALID_PRECISION) { prec_sloppy = prec; }
-
-  if (prec_refinement_sloppy == QUDA_INVALID_PRECISION) { prec_refinement_sloppy = prec_sloppy; }
-  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) { link_recon_sloppy = link_recon; }
+  // Default to a Schur-preconditioned CG solve
+  solve_type = QUDA_DIRECT_PC_SOLVE;
+  solution_type = QUDA_MATPC_SOLUTION;
+  matpc_type = QUDA_MATPC_EVEN_EVEN;
+  inv_type = QUDA_CG_INVERTER;
 
-  if (inv_type != QUDA_CG_INVERTER && (test_type == 5 || test_type == 6)) {
-    errorQuda("Preconditioning is currently not supported in multi-shift solver solvers");
-  }
-
-  // Set n_naiks to 2 if eps_naik != 0.0
-  if (eps_naik != 0.0) {
-    if (compute_fatlong)
-      n_naiks = 2;
-    else
-      eps_naik = 0.0; // to avoid potential headaches
-  }
-}
-
-void setQudaStaggeredEigTestParams()
-{
-  if (dslash_type == QUDA_LAPLACE_DSLASH) {
-    // LAPLACE operator path, only DIRECT solves feasible.
-    if (test_type != 0) { errorQuda("Test type %d is not supported for the Laplace operator.\n", test_type); }
-    solve_type = QUDA_DIRECT_SOLVE;
-    solution_type = QUDA_MAT_SOLUTION;
-  } else {
-    // STAGGERED operator path
-    if (solve_type == QUDA_INVALID_SOLVE) {
-      if (test_type == 0) {
-        solve_type = QUDA_DIRECT_SOLVE;
-      } else {
-        solve_type = QUDA_DIRECT_PC_SOLVE;
-      }
-    }
-    // If test type is not 3, it is 4 or 0. If 0, the matpc type is irrelevant
-    if (test_type == 3)
-      matpc_type = QUDA_MATPC_EVEN_EVEN;
-    else
-      matpc_type = QUDA_MATPC_ODD_ODD;
-
-    if (test_type == 0) {
-      solution_type = QUDA_MAT_SOLUTION;
-    } else {
-      solution_type = QUDA_MATPC_SOLUTION;
-    }
-  }
-
-  // Set n_naiks to 2 if eps_naik != 0.0
-  if (eps_naik != 0.0) {
-    if (compute_fatlong)
-      n_naiks = 2;
-    else
-      eps_naik = 0.0; // to avoid potential headaches
-  }
+  // For an eigensolve, default to using the "regular" operator instead of the normal
+  // operator because the Schur operator is already HPD
+  eig_use_normop = QUDA_BOOLEAN_FALSE;
+  eig_use_pc = true;
 }

From b39297b0f59dcb31d6f5caafd6f356f1c3856ef8 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 29 Nov 2023 15:19:51 -0800
Subject: [PATCH 09/53] Misc cleanup to make hisq_stencil_test match some
 conventions in staggered_dslash_test_utils

---
 tests/hisq_stencil_test.cpp         | 168 ++++++++++------------------
 tests/staggered_dslash_test_utils.h |   1 -
 2 files changed, 61 insertions(+), 108 deletions(-)

diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp
index 98c2ae91d3..a9103e4e4b 100644
--- a/tests/hisq_stencil_test.cpp
+++ b/tests/hisq_stencil_test.cpp
@@ -46,8 +46,7 @@ static double max_allowed_error = 1e-11;
 
 static void hisq_test()
 {
-
-  QudaGaugeParam qudaGaugeParam;
+  QudaGaugeParam gauge_param;
 
   initQuda(device_ordinal);
 
@@ -55,42 +54,20 @@ static void hisq_test()
     errorQuda("Precision %d is unsupported in some link fattening routines\n", prec);
   }
 
+  if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order);
+
   cpu_prec = prec;
   host_gauge_data_type_size = cpu_prec;
-  qudaGaugeParam = newQudaGaugeParam();
-
-  qudaGaugeParam.anisotropy = 1.0;
-
-  // Fix me: must always be set to 1.0 for reasons not yet discerned.
-  // The tadpole coefficient gets encoded directly into the fat link
-  // construct coefficents.
-  qudaGaugeParam.tadpole_coeff = 1.0;
-
-  qudaGaugeParam.X[0] = xdim;
-  qudaGaugeParam.X[1] = ydim;
-  qudaGaugeParam.X[2] = zdim;
-  qudaGaugeParam.X[3] = tdim;
 
-  setDims(qudaGaugeParam.X);
+  gauge_param = newQudaGaugeParam();
 
-  qudaGaugeParam.cpu_prec = cpu_prec;
-  qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec;
+  setStaggeredGaugeParam(gauge_param);
 
-  if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order);
-
-  qudaGaugeParam.gauge_order = gauge_order;
-  qudaGaugeParam.type = QUDA_WILSON_LINKS;
-  qudaGaugeParam.reconstruct = qudaGaugeParam.reconstruct_sloppy = link_recon;
-  qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
-  qudaGaugeParam.staggered_phase_type = QUDA_STAGGERED_PHASE_MILC;
-  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
-  qudaGaugeParam.ga_pad = 0;
+  setDims(gauge_param.X);
 
-  // Needed for unitarization, following "unitarize_link_test.cpp"
-  GaugeFieldParam gParam(qudaGaugeParam);
-  gParam.link_type = QUDA_GENERAL_LINKS;
-  gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  gParam.order = gauge_order;
+  gauge_param.cpu_prec = cpu_prec;
+  gauge_param.cuda_prec_sloppy = cuda_prec;
+  gauge_param.reconstruct_sloppy = link_recon;
 
   ///////////////////////////////////////////////////////////////
   // Set up the coefficients for each part of the HISQ stencil //
@@ -148,21 +125,14 @@ static void hisq_test()
   // Input links //
   /////////////////
 
-  void *sitelink[4];
-  for (int i = 0; i < 4; i++) sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
+  void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr};
+  for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
 
-  void *milc_sitelink;
-  milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+  void *milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
   // Note: this could be replaced with loading a gauge field
-  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 0); // 0 -> no phases
-  for (int i = 0; i < V; ++i) {
-    for (int dir = 0; dir < 4; ++dir) {
-      char *src = (char *)sitelink[dir];
-      memcpy((char *)milc_sitelink + (i * 4 + dir) * gauge_site_size * host_gauge_data_type_size,
-             src + i * gauge_site_size * host_gauge_data_type_size, gauge_site_size * host_gauge_data_type_size);
-    }
-  }
+  createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases
+  reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
 
   //////////////////////
   // Perform GPU test //
@@ -187,7 +157,7 @@ static void hisq_test()
   // Tuning run...
   {
     printfQuda("Tuning...\n");
-    computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &qudaGaugeParam);
+    computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &gauge_param);
   }
 
   struct timeval t0, t1;
@@ -198,11 +168,11 @@ static void hisq_test()
     // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
 
     // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
-    computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &qudaGaugeParam);
+    computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param);
 
     if (n_naiks > 1) {
       // Create Naiks, 3rd path table set
-      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &qudaGaugeParam);
+      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param);
 
       // Rescale+copy Naiks into Naik field
       cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
@@ -213,7 +183,7 @@ static void hisq_test()
     }
 
     // Create X and long links, 2nd path table set
-    computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &qudaGaugeParam);
+    computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param);
 
     if (n_naiks > 1) {
       // Add into Naik field
@@ -229,24 +199,26 @@ static void hisq_test()
   // Perform CPU Build //
   ///////////////////////
 
-  void *long_reflink[4]; // Long link for fermion with zero epsilon
-  void *fat_reflink[4];  // Fat link for fermion with zero epsilon
+  // fat and long links for fermions with zero epsilon
+  void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
   for (int i = 0; i < 4; i++) {
-    long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
     fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
   }
 
-  void *long_reflink_eps[4]; // Long link for fermion with non-zero epsilon
-  void *fat_reflink_eps[4];  // Fat link for fermion with non-zero epsilon
+  // fat and long links for fermions with non-zero epsilon
+  void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
   if (n_naiks > 1) {
     for (int i = 0; i < 4; i++) {
-      long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
       fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
     }
   }
 
   if (verify_results) {
-    computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, sitelink, &qudaGaugeParam,
+    computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param,
                         act_paths, eps_naik);
   }
 
@@ -254,45 +226,25 @@ static void hisq_test()
   // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
   ////////////////////////////////////////////////////////////////////
 
-  void *myfatlink[4];
-  void *mylonglink[4];
-  void *myfatlink_eps[4];
-  void *mylonglink_eps[4];
+  void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+  void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
   for (int i = 0; i < 4; i++) {
-
-    myfatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    mylonglink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    memset(myfatlink[i], 0, V * gauge_site_size * host_gauge_data_type_size);
-    memset(mylonglink[i], 0, V * gauge_site_size * host_gauge_data_type_size);
-
+    qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
     if (n_naiks > 1) {
-      myfatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      mylonglink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      memset(myfatlink_eps[i], 0, V * gauge_site_size * host_gauge_data_type_size);
-      memset(mylonglink_eps[i], 0, V * gauge_site_size * host_gauge_data_type_size);
+      qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
     }
   }
 
-  for (int i = 0; i < V; i++) {
-    for (int dir = 0; dir < 4; dir++) {
-      char *src = ((char *)fatlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
-      char *dst = ((char *)myfatlink[dir]) + i * gauge_site_size * host_gauge_data_type_size;
-      memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
-
-      src = ((char *)longlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
-      dst = ((char *)mylonglink[dir]) + i * gauge_site_size * host_gauge_data_type_size;
-      memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
-
-      if (n_naiks > 1) {
-        src = ((char *)fatlink_eps) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
-        dst = ((char *)myfatlink_eps[dir]) + i * gauge_site_size * host_gauge_data_type_size;
-        memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
-
-        src = ((char *)longlink_eps) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
-        dst = ((char *)mylonglink_eps[dir]) + i * gauge_site_size * host_gauge_data_type_size;
-        memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
-      }
-    }
+  reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+  reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+
+  if (n_naiks > 1) {
+    reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
   }
 
   //////////////////////////////
@@ -303,20 +255,20 @@ static void hisq_test()
     printfQuda("Checking fat links...\n");
     int res = 1;
     for (int dir = 0; dir < 4; dir++) {
-      res &= compare_floats(fat_reflink[dir], myfatlink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec);
+      res &= compare_floats(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec);
     }
 
-    strong_check_link(myfatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec);
+    strong_check_link(qdp_fatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
 
     printfQuda("Fat-link test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
 
     printfQuda("Checking long links...\n");
     res = 1;
     for (int dir = 0; dir < 4; ++dir) {
-      res &= compare_floats(long_reflink[dir], mylonglink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec);
+      res &= compare_floats(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec);
     }
 
-    strong_check_link(mylonglink, "GPU results: ", long_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec);
+    strong_check_link(qdp_longlink, "GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
 
     printfQuda("Long-link test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
 
@@ -325,31 +277,31 @@ static void hisq_test()
       printfQuda("Checking fat eps_naik links...\n");
       res = 1;
       for (int dir = 0; dir < 4; dir++) {
-        res &= compare_floats(fat_reflink_eps[dir], myfatlink_eps[dir], V * gauge_site_size, 1e-3,
-                              qudaGaugeParam.cpu_prec);
+        res &= compare_floats(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, 1e-3,
+                              gauge_param.cpu_prec);
       }
 
-      strong_check_link(myfatlink_eps, "GPU results: ", fat_reflink_eps, "CPU reference results:", V,
-                        qudaGaugeParam.cpu_prec);
+      strong_check_link(qdp_fatlink_eps, "GPU results: ", fat_reflink_eps, "CPU reference results:", V,
+                        gauge_param.cpu_prec);
 
       printfQuda("Fat-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
 
       printfQuda("Checking long eps_naik links...\n");
       res = 1;
       for (int dir = 0; dir < 4; ++dir) {
-        res &= compare_floats(long_reflink_eps[dir], mylonglink_eps[dir], V * gauge_site_size, 1e-3,
-                              qudaGaugeParam.cpu_prec);
+        res &= compare_floats(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, 1e-3,
+                              gauge_param.cpu_prec);
       }
 
-      strong_check_link(mylonglink_eps, "GPU results: ", long_reflink_eps, "CPU reference results:", V,
-                        qudaGaugeParam.cpu_prec);
+      strong_check_link(qdp_longlink_eps, "GPU results: ", long_reflink_eps, "CPU reference results:", V,
+                        gauge_param.cpu_prec);
 
       printfQuda("Long-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
     }
   }
 
   // FIXME: does not include unitarization, extra naiks
-  int volume = qudaGaugeParam.X[0] * qudaGaugeParam.X[1] * qudaGaugeParam.X[2] * qudaGaugeParam.X[3];
+  int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3];
   long long flops = 61632 * (long long)niter; // Constructing V field
   // Constructing W field?
   // Constructing separate Naiks
@@ -360,16 +312,16 @@ static void hisq_test()
   printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf);
 
   for (int i = 0; i < 4; i++) {
-    host_free(myfatlink[i]);
-    host_free(mylonglink[i]);
+    host_free(qdp_fatlink[i]);
+    host_free(qdp_longlink[i]);
     if (n_naiks > 1) {
-      host_free(myfatlink_eps[i]);
-      host_free(mylonglink_eps[i]);
+      host_free(qdp_fatlink_eps[i]);
+      host_free(qdp_longlink_eps[i]);
     }
   }
 
   for (int i = 0; i < 4; i++) {
-    host_free(sitelink[i]);
+    host_free(qdp_sitelink[i]);
     host_free(fat_reflink[i]);
     host_free(long_reflink[i]);
     if (n_naiks > 1) {
@@ -433,6 +385,8 @@ int main(int argc, char **argv)
 
   if (eps_naik != 0.0) { n_naiks = 2; }
 
+  setVerbosity(verbosity);
+
   initComms(argc, argv, gridsize_from_cmdline);
   display_test_info();
   hisq_test();
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 264371f2f0..246dcdfea4 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -221,7 +221,6 @@ struct StaggeredDslashTestWrapper {
     // set verbosity prior to loadGaugeQuda
     setVerbosity(verbosity);
 
-    
   }
 
   void init()

From a1049cf823960d5175f5763d223db5c2023e680f Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 29 Nov 2023 21:37:58 -0800
Subject: [PATCH 10/53] Significant refactoring to hisq_stencil_test, getting
 closer to simple test and ctest flavors

---
 tests/hisq_stencil_test.cpp | 506 ++++++++++++++++++++----------------
 1 file changed, 288 insertions(+), 218 deletions(-)

diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp
index a9103e4e4b..77f7933a40 100644
--- a/tests/hisq_stencil_test.cpp
+++ b/tests/hisq_stencil_test.cpp
@@ -1,21 +1,16 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/time.h>
 
-#include "quda.h"
-#include "gauge_field.h"
-#include "host_utils.h"
+#include <quda.h>
+#include <gauge_field.h>
+#include <host_utils.h>
 #include <command_line_params.h>
-#include "misc.h"
-#include "util_quda.h"
-#include "malloc_quda.h"
+#include <misc.h>
 #include <unitarization_links.h>
-#include "ks_improved_force.h"
+#include <ks_improved_force.h>
 
-#ifdef MULTI_GPU
-#include "comm_quda.h"
-#endif
+#include <tune_quda.h>
 
 #define TDIFF(a, b) (b.tv_sec - a.tv_sec + 0.000001 * (b.tv_usec - a.tv_usec))
 
@@ -36,222 +31,322 @@ static double svd_rel_error = 1e-4;
 static double svd_abs_error = 1e-4;
 static double max_allowed_error = 1e-11;
 
-/*--------------------------------------------------------------------*/
-// Some notation:
-// U -- original link, SU(3), copied to "field" from "site"
-// V -- after 1st level of smearing, non-SU(3)
-// W -- unitarized, SU(3)
-// X -- after 2nd level of smearing, non-SU(3)
-/*--------------------------------------------------------------------*/
+struct HisqStencilTestWrapper {
 
-static void hisq_test()
-{
-  QudaGaugeParam gauge_param;
+  static inline QudaGaugeParam gauge_param;
 
-  initQuda(device_ordinal);
+  // staple coefficients for different portions of the HISQ stencil build
+  static inline std::array<std::array<double, 6>, 3> act_paths;
 
-  if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION) {
-    errorQuda("Precision %d is unsupported in some link fattening routines\n", prec);
-  }
+  // initial links in MILC order
+  static inline void* milc_sitelink = nullptr;
 
-  if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order);
+  // storage for CPU reference fat and long links w/zero Naik
+  static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
 
-  cpu_prec = prec;
-  host_gauge_data_type_size = cpu_prec;
+  // storage for CPU reference fat and long links w/non-zero Naik
+  static inline void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
 
-  gauge_param = newQudaGaugeParam();
+  // Paths for step 1:
+  static inline void *vlink = nullptr;
+  static inline void *wlink = nullptr;
 
-  setStaggeredGaugeParam(gauge_param);
+  // Paths for step 2:
+  static inline void *fatlink = nullptr;
+  static inline void *longlink = nullptr;
 
-  setDims(gauge_param.X);
+  // Place to accumulate Naiks
+  static inline void *fatlink_eps = nullptr;
+  static inline void *longlink_eps = nullptr;
 
-  gauge_param.cpu_prec = cpu_prec;
-  gauge_param.cuda_prec_sloppy = cuda_prec;
-  gauge_param.reconstruct_sloppy = link_recon;
+  static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
 
-  ///////////////////////////////////////////////////////////////
-  // Set up the coefficients for each part of the HISQ stencil //
-  ///////////////////////////////////////////////////////////////
+  void init_test() {
+    cpu_prec = prec;
+    host_gauge_data_type_size = cpu_prec;
 
-  // Reference: "generic_ks/imp_actions/hisq/hisq_action.h",
-  // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c
+    gauge_param = newQudaGaugeParam();
+    setStaggeredGaugeParam(gauge_param);
 
-  double u1 = 1.0 / tadpole_factor;
-  double u2 = u1 * u1;
-  double u4 = u2 * u2;
-  double u6 = u4 * u2;
+    static bool first_time = true;
+    if (first_time) {
+      init_host();
+      first_time = false;
+    }
+  }
 
-  std::array<std::array<double, 6>, 3> act_paths;
+  void init_host() {
+    setDims(gauge_param.X);
+    dw_setDims(gauge_param.X, 1);
+
+    ///////////////////////////////////////////////////////////////
+    // Set up the coefficients for each part of the HISQ stencil //
+    ///////////////////////////////////////////////////////////////
+
+    // Reference: "generic_ks/imp_actions/hisq/hisq_action.h",
+    // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c
+
+    double u1 = 1.0 / tadpole_factor;
+    double u2 = u1 * u1;
+    double u4 = u2 * u2;
+    double u6 = u4 * u2;
+
+    // First path: create V, W links
+    act_paths[0] = {
+      (1.0 / 8.0),                             /* one link */
+      u2 * (0.0),                              /* Naik */
+      u2 * (-1.0 / 8.0) * 0.5,                 /* simple staple */
+      u4 * (1.0 / 8.0) * 0.25 * 0.5,           /* displace link in two directions */
+      u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */
+      u4 * (0.0)                               /* Lepage term */
+    };
+
+    // Second path: create X, long links
+    act_paths[1] = {
+      ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */
+                                                        /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
+      (-1.0 / 24.0),                                    /* Naik */
+      (-1.0 / 8.0) * 0.5,                               /* simple staple */
+      (1.0 / 8.0) * 0.25 * 0.5,                         /* displace link in two directions */
+      (-1.0 / 8.0) * 0.125 * (1.0 / 6.0),               /* displace link in three directions */
+      (-2.0 / 16.0)                                     /* Lepage term, correct O(a^2) 2x ASQTAD */
+    };
+
+    // Paths for epsilon corrections. Not used if n_naiks = 1.
+    act_paths[2] = {
+      (1.0 / 8.0),   /* one link b/c of Naik */
+      (-1.0 / 24.0), /* Naik */
+      0.0,           /* simple staple */
+      0.0,           /* displace link in two directions */
+      0.0,           /* displace link in three directions */
+      0.0            /* Lepage term */
+    };
+
+    ////////////////////////////////////
+    // Set unitarization coefficients //
+    ////////////////////////////////////
+
+    setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
+                              svd_abs_error);
+
+    /////////////////
+    // Input links //
+    /////////////////
+
+    void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr};
+    for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
+
+    milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+
+    // Note: this could be replaced with loading a gauge field
+    createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases
+    reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+
+    ///////////////////////
+    // Perform CPU Build //
+    ///////////////////////
 
-  // First path: create V, W links
-  act_paths[0] = {
-    (1.0 / 8.0),                             /* one link */
-    u2 * (0.0),                              /* Naik */
-    u2 * (-1.0 / 8.0) * 0.5,                 /* simple staple */
-    u4 * (1.0 / 8.0) * 0.25 * 0.5,           /* displace link in two directions */
-    u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */
-    u4 * (0.0)                               /* Lepage term */
-  };
+    for (int i = 0; i < 4; i++) {
+      // fat and long links for fermions with zero epsilon
+      fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+
+      // fat and long links for fermions with non-zero epsilon
+      if (n_naiks > 1) {
+        fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+        long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      }
+    }
 
-  // Second path: create X, long links
-  act_paths[1] = {
-    ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */
-                                                      /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
-    (-1.0 / 24.0),                                    /* Naik */
-    (-1.0 / 8.0) * 0.5,                               /* simple staple */
-    (1.0 / 8.0) * 0.25 * 0.5,                         /* displace link in two directions */
-    (-1.0 / 8.0) * 0.125 * (1.0 / 6.0),               /* displace link in three directions */
-    (-2.0 / 16.0)                                     /* Lepage term, correct O(a^2) 2x ASQTAD */
-  };
+    computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param,
+                        act_paths, eps_naik);
 
-  // Paths for epsilon corrections. Not used if n_naiks = 1.
-  act_paths[2] = {
-    (1.0 / 8.0),   /* one link b/c of Naik */
-    (-1.0 / 24.0), /* Naik */
-    0.0,           /* simple staple */
-    0.0,           /* displace link in two directions */
-    0.0,           /* displace link in three directions */
-    0.0            /* Lepage term */
-  };
+    ///////////////////////////////////////////////////////
+    // Allocate host storage for fields built on the GPU //
+    ///////////////////////////////////////////////////////
 
-  ////////////////////////////////////
-  // Set unitarization coefficients //
-  ////////////////////////////////////
+    // Paths for step 1:
+    vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links
+    wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links
 
-  setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
-                             svd_abs_error);
+    // Paths for step 2:
+    fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // final fat ("X") links
+    longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links
 
-  /////////////////
-  // Input links //
-  /////////////////
+    // Place to accumulate Naiks
+    if (n_naiks > 1) {
+      fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // epsilon fat links
+      longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks
+    }
 
-  void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr};
-  for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    // QDP order fields
+    for (int i = 0; i < 4; i++) {
+      qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      if (n_naiks > 1) {
+        qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+        qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      }
+    }
 
-  void *milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    /////////////////////////////////////////////////////////
+    // Free allocations that are only needed for CPU setup //
+    /////////////////////////////////////////////////////////
 
-  // Note: this could be replaced with loading a gauge field
-  createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases
-  reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    for (int i = 0; i < 4; i++)
+      host_free(qdp_sitelink[i]);
+  }
 
-  //////////////////////
-  // Perform GPU test //
-  //////////////////////
+  static void destroy() {
+    if (milc_sitelink) host_free(milc_sitelink);
 
-  // Paths for step 1:
-  void *vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links
-  void *wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links
+    for (int i = 0; i < 4; i++) {
+      host_free(fat_reflink[i]);
+      host_free(long_reflink[i]);
+      if (n_naiks > 1) {
+        host_free(fat_reflink_eps[i]);
+        host_free(long_reflink_eps[i]);
+      }
+    }
 
-  // Paths for step 2:
-  void *fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // final fat ("X") links
-  void *longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links
+    // Clean up GPU compute links
+    host_free(vlink);
+    host_free(wlink);
+    host_free(fatlink);
+    host_free(longlink);
 
-  // Place to accumulate Naiks
-  void *fatlink_eps = nullptr;
-  void *longlink_eps = nullptr;
-  if (n_naiks > 1) {
-    fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // epsilon fat links
-    longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks
-  }
+    if (n_naiks > 1) {
+      host_free(fatlink_eps);
+      host_free(longlink_eps);
+    }
 
-  // Tuning run...
-  {
-    printfQuda("Tuning...\n");
-    computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &gauge_param);
+    for (int i = 0; i < 4; i++) {
+      host_free(qdp_fatlink[i]);
+      host_free(qdp_longlink[i]);
+      if (n_naiks > 1) {
+        host_free(qdp_fatlink_eps[i]);
+        host_free(qdp_longlink_eps[i]);
+      }
+    }
+
+#ifdef MULTI_GPU
+    exchange_llfat_cleanup();
+#endif
   }
 
-  struct timeval t0, t1;
-  printfQuda("Running %d iterations of computation\n", niter);
-  gettimeofday(&t0, NULL);
-  for (int n = 0; n < niter; n++) {
+  /*--------------------------------------------------------------------*/
+  // Some notation:
+  // U -- original link, SU(3), copied to "field" from "site"
+  // V -- after 1st level of smearing, non-SU(3)
+  // W -- unitarized, SU(3)
+  // X -- after 2nd level of smearing, non-SU(3)
+  /*--------------------------------------------------------------------*/
 
-    // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
+  double llfatCUDA(int niter) {
+    host_timer_t host_timer;
 
-    // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
-    computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param);
+    comm_barrier();
+    host_timer.start();
 
-    if (n_naiks > 1) {
-      // Create Naiks, 3rd path table set
-      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param);
-
-      // Rescale+copy Naiks into Naik field
-      cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
-      cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size);
-    } else {
-      memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
-      memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
-    }
+    for (int i = 0; i < niter; i++) {
+      // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
 
-    // Create X and long links, 2nd path table set
-    computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param);
+      // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
+      computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param);
 
-    if (n_naiks > 1) {
-      // Add into Naik field
-      cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size);
-      cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size);
+      if (n_naiks > 1) {
+        // Create Naiks, 3rd path table set
+        computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param);
+
+        // Rescale+copy Naiks into Naik field
+        cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
+        cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size);
+      } else {
+        memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
+        memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
+      }
+
+      // Create X and long links, 2nd path table set
+      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param);
+
+      if (n_naiks > 1) {
+        // Add into Naik field
+        cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size);
+        cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size);
+      }
     }
+
+    host_timer.stop();
+
+    return host_timer.last();
   }
-  gettimeofday(&t1, NULL);
 
-  double secs = TDIFF(t0, t1);
+  void run_test(int niter, bool print_metrics = false) {
+    //////////////////////
+    // Perform GPU test //
+    //////////////////////
 
-  ///////////////////////
-  // Perform CPU Build //
-  ///////////////////////
+    printfQuda("Tuning...\n");
+    llfatCUDA(1);
 
-  // fat and long links for fermions with zero epsilon
-  void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
-  for (int i = 0; i < 4; i++) {
-    fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-  }
+    auto flops0 = quda::Tunable::flops_global();
+    auto bytes0 = quda::Tunable::bytes_global();
 
-  // fat and long links for fermions with non-zero epsilon
-  void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-  if (n_naiks > 1) {
-    for (int i = 0; i < 4; i++) {
-      fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    }
-  }
+    printfQuda("Running %d iterations of computation\n", niter);
+    double secs = llfatCUDA(niter);
 
-  if (verify_results) {
-    computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param,
-                        act_paths, eps_naik);
-  }
+    unsigned long long flops = (quda::Tunable::flops_global() - flops0);
+    unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
 
-  ////////////////////////////////////////////////////////////////////
-  // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
-  ////////////////////////////////////////////////////////////////////
-
-  void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-  for (int i = 0; i < 4; i++) {
-    qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    if (n_naiks > 1) {
-      qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    if (print_metrics) {
+      // FIXME: does not include unitarization, extra naiks
+      int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3];
+      //long long flops = 61632 * (long long)niter; // Constructing V field
+      // Constructing W field?
+      // Constructing separate Naiks
+      //flops += 61632 * (long long)niter;     // Constructing X field
+      //flops += (252 * 4) * (long long)niter; // long-link contribution
+
+      printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter);
+
+      printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter,
+                    (flops / niter) / volume, (bytes / niter) / volume);
+
+      double gflops = 1.0e-9 * flops / secs;
+        printfQuda("GFLOPS = %f\n", gflops);
+
+      double gbytes = 1.0e-9 * bytes / secs;
+        printfQuda("GBYTES = %f\n", gbytes);
+
+      // Old metric
+      //double perf = flops / (secs * 1024 * 1024 * 1024);
+      //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf);
     }
   }
 
-  reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-  reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+  void verify()
+  {
+    ////////////////////////////////////////////////////////////////////
+    // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
+    ////////////////////////////////////////////////////////////////////
 
-  if (n_naiks > 1) {
-    reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-  }
+    reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+
+    if (n_naiks > 1) {
+      reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+      reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    }
 
-  //////////////////////////////
-  // Perform the verification //
-  //////////////////////////////
+    //////////////////////////////
+    // Perform the verification //
+    //////////////////////////////
 
-  if (verify_results) {
     printfQuda("Checking fat links...\n");
     int res = 1;
     for (int dir = 0; dir < 4; dir++) {
@@ -299,52 +394,22 @@ static void hisq_test()
       printfQuda("Long-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
     }
   }
+};
 
-  // FIXME: does not include unitarization, extra naiks
-  int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3];
-  long long flops = 61632 * (long long)niter; // Constructing V field
-  // Constructing W field?
-  // Constructing separate Naiks
-  flops += 61632 * (long long)niter;     // Constructing X field
-  flops += (252 * 4) * (long long)niter; // long-link contribution
+static void hisq_test()
+{
+  initQuda(device_ordinal);
 
-  double perf = flops * volume / (secs * 1024 * 1024 * 1024);
-  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf);
+  HisqStencilTestWrapper test_wrapper;
 
-  for (int i = 0; i < 4; i++) {
-    host_free(qdp_fatlink[i]);
-    host_free(qdp_longlink[i]);
-    if (n_naiks > 1) {
-      host_free(qdp_fatlink_eps[i]);
-      host_free(qdp_longlink_eps[i]);
-    }
-  }
+  test_wrapper.init_test();
 
-  for (int i = 0; i < 4; i++) {
-    host_free(qdp_sitelink[i]);
-    host_free(fat_reflink[i]);
-    host_free(long_reflink[i]);
-    if (n_naiks > 1) {
-      host_free(fat_reflink_eps[i]);
-      host_free(long_reflink_eps[i]);
-    }
-  }
+  test_wrapper.run_test(niter, true);
 
-  // Clean up GPU compute links
-  host_free(vlink);
-  host_free(wlink);
-  host_free(fatlink);
-  host_free(longlink);
+  test_wrapper.verify();
 
-  if (n_naiks > 1) {
-    host_free(fatlink_eps);
-    host_free(longlink_eps);
-  }
+  test_wrapper.destroy();
 
-  if (milc_sitelink) host_free(milc_sitelink);
-#ifdef MULTI_GPU
-  exchange_llfat_cleanup();
-#endif
   endQuda();
 }
 
@@ -383,6 +448,11 @@ int main(int argc, char **argv)
     return app->exit(e);
   }
 
+  if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION)
+    errorQuda("Precision %d is unsupported in some link fattening routines\n", prec);
+
+  if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order);
+
   if (eps_naik != 0.0) { n_naiks = 2; }
 
   setVerbosity(verbosity);

From 14f1407542f49c152a2e51c8dc542201a508c4a7 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 29 Nov 2023 22:37:25 -0800
Subject: [PATCH 11/53] hisq_stencil_test now runs via gtest, creating a ctest
 is outstanding

---
 tests/hisq_stencil_test.cpp     | 469 ++++----------------------------
 tests/hisq_stencil_test_utils.h | 402 +++++++++++++++++++++++++++
 2 files changed, 458 insertions(+), 413 deletions(-)
 create mode 100644 tests/hisq_stencil_test_utils.h

diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp
index 77f7933a40..138bba9eea 100644
--- a/tests/hisq_stencil_test.cpp
+++ b/tests/hisq_stencil_test.cpp
@@ -1,435 +1,75 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <quda.h>
-#include <gauge_field.h>
-#include <host_utils.h>
-#include <command_line_params.h>
-#include <misc.h>
-#include <unitarization_links.h>
-#include <ks_improved_force.h>
-
-#include <tune_quda.h>
-
-#define TDIFF(a, b) (b.tv_sec - a.tv_sec + 0.000001 * (b.tv_usec - a.tv_usec))
+#include "hisq_stencil_test_utils.h"
 
 using namespace quda;
 
-// Number of naiks. If eps_naik is 0.0, we only need
-// to construct one naik.
-static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER;
-
-// The file "generic_ks/fermion_links_hisq_load_milc.c"
-// within MILC is the ultimate reference for what's going on here.
-
-// Unitarization coefficients
-static double unitarize_eps = 1e-6;
-static bool reunit_allow_svd = true;
-static bool reunit_svd_only = false;
-static double svd_rel_error = 1e-4;
-static double svd_abs_error = 1e-4;
-static double max_allowed_error = 1e-11;
-
-struct HisqStencilTestWrapper {
-
-  static inline QudaGaugeParam gauge_param;
-
-  // staple coefficients for different portions of the HISQ stencil build
-  static inline std::array<std::array<double, 6>, 3> act_paths;
-
-  // initial links in MILC order
-  static inline void* milc_sitelink = nullptr;
-
-  // storage for CPU reference fat and long links w/zero Naik
-  static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
-  static inline void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
-
-  // storage for CPU reference fat and long links w/non-zero Naik
-  static inline void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-  static inline void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-
-  // Paths for step 1:
-  static inline void *vlink = nullptr;
-  static inline void *wlink = nullptr;
-
-  // Paths for step 2:
-  static inline void *fatlink = nullptr;
-  static inline void *longlink = nullptr;
-
-  // Place to accumulate Naiks
-  static inline void *fatlink_eps = nullptr;
-  static inline void *longlink_eps = nullptr;
-
-  static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-  static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
-
-  void init_test() {
-    cpu_prec = prec;
-    host_gauge_data_type_size = cpu_prec;
-
-    gauge_param = newQudaGaugeParam();
-    setStaggeredGaugeParam(gauge_param);
-
-    static bool first_time = true;
-    if (first_time) {
-      init_host();
-      first_time = false;
-    }
+class HisqStencilTest : public ::testing::Test
+{
+protected:
+  HisqStencilTestWrapper hisq_stencil_test_wrapper;
+
+  void display_test_info() {
+    printfQuda("running the following test:\n");
+    printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       Ordering\n");
+    printfQuda("%s                       %s                         %d/%d/%d/                  %d             %s \n",
+              get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
+    printfQuda("Grid partition info:     X  Y  Z  T\n");
+    printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
+              dimPartitioned(3));
+    printfQuda("Number of Naiks: %d\n", n_naiks);
   }
 
-  void init_host() {
-    setDims(gauge_param.X);
-    dw_setDims(gauge_param.X, 1);
-
-    ///////////////////////////////////////////////////////////////
-    // Set up the coefficients for each part of the HISQ stencil //
-    ///////////////////////////////////////////////////////////////
-
-    // Reference: "generic_ks/imp_actions/hisq/hisq_action.h",
-    // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c
-
-    double u1 = 1.0 / tadpole_factor;
-    double u2 = u1 * u1;
-    double u4 = u2 * u2;
-    double u6 = u4 * u2;
-
-    // First path: create V, W links
-    act_paths[0] = {
-      (1.0 / 8.0),                             /* one link */
-      u2 * (0.0),                              /* Naik */
-      u2 * (-1.0 / 8.0) * 0.5,                 /* simple staple */
-      u4 * (1.0 / 8.0) * 0.25 * 0.5,           /* displace link in two directions */
-      u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */
-      u4 * (0.0)                               /* Lepage term */
-    };
-
-    // Second path: create X, long links
-    act_paths[1] = {
-      ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */
-                                                        /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
-      (-1.0 / 24.0),                                    /* Naik */
-      (-1.0 / 8.0) * 0.5,                               /* simple staple */
-      (1.0 / 8.0) * 0.25 * 0.5,                         /* displace link in two directions */
-      (-1.0 / 8.0) * 0.125 * (1.0 / 6.0),               /* displace link in three directions */
-      (-2.0 / 16.0)                                     /* Lepage term, correct O(a^2) 2x ASQTAD */
-    };
-
-    // Paths for epsilon corrections. Not used if n_naiks = 1.
-    act_paths[2] = {
-      (1.0 / 8.0),   /* one link b/c of Naik */
-      (-1.0 / 24.0), /* Naik */
-      0.0,           /* simple staple */
-      0.0,           /* displace link in two directions */
-      0.0,           /* displace link in three directions */
-      0.0            /* Lepage term */
-    };
-
-    ////////////////////////////////////
-    // Set unitarization coefficients //
-    ////////////////////////////////////
-
-    setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
-                              svd_abs_error);
-
-    /////////////////
-    // Input links //
-    /////////////////
-
-    void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr};
-    for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
-    milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-
-    // Note: this could be replaced with loading a gauge field
-    createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases
-    reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-
-    ///////////////////////
-    // Perform CPU Build //
-    ///////////////////////
-
-    for (int i = 0; i < 4; i++) {
-      // fat and long links for fermions with zero epsilon
-      fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
-      // fat and long links for fermions with non-zero epsilon
-      if (n_naiks > 1) {
-        fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-        long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      }
-    }
-
-    computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param,
-                        act_paths, eps_naik);
-
-    ///////////////////////////////////////////////////////
-    // Allocate host storage for fields built on the GPU //
-    ///////////////////////////////////////////////////////
-
-    // Paths for step 1:
-    vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links
-    wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links
-
-    // Paths for step 2:
-    fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // final fat ("X") links
-    longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links
-
-    // Place to accumulate Naiks
-    if (n_naiks > 1) {
-      fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // epsilon fat links
-      longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks
-    }
-
-    // QDP order fields
-    for (int i = 0; i < 4; i++) {
-      qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      if (n_naiks > 1) {
-        qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-        qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      }
-    }
-
-    /////////////////////////////////////////////////////////
-    // Free allocations that are only needed for CPU setup //
-    /////////////////////////////////////////////////////////
-
-    for (int i = 0; i < 4; i++)
-      host_free(qdp_sitelink[i]);
+public:
+  virtual void SetUp() {
+    hisq_stencil_test_wrapper.init_test();
+    display_test_info();
   }
 
-  static void destroy() {
-    if (milc_sitelink) host_free(milc_sitelink);
-
-    for (int i = 0; i < 4; i++) {
-      host_free(fat_reflink[i]);
-      host_free(long_reflink[i]);
-      if (n_naiks > 1) {
-        host_free(fat_reflink_eps[i]);
-        host_free(long_reflink_eps[i]);
-      }
-    }
-
-    // Clean up GPU compute links
-    host_free(vlink);
-    host_free(wlink);
-    host_free(fatlink);
-    host_free(longlink);
-
-    if (n_naiks > 1) {
-      host_free(fatlink_eps);
-      host_free(longlink_eps);
-    }
-
-    for (int i = 0; i < 4; i++) {
-      host_free(qdp_fatlink[i]);
-      host_free(qdp_longlink[i]);
-      if (n_naiks > 1) {
-        host_free(qdp_fatlink_eps[i]);
-        host_free(qdp_longlink_eps[i]);
-      }
-    }
-
-#ifdef MULTI_GPU
-    exchange_llfat_cleanup();
-#endif
+  virtual void TearDown() {
+    hisq_stencil_test_wrapper.end();
   }
 
-  /*--------------------------------------------------------------------*/
-  // Some notation:
-  // U -- original link, SU(3), copied to "field" from "site"
-  // V -- after 1st level of smearing, non-SU(3)
-  // W -- unitarized, SU(3)
-  // X -- after 2nd level of smearing, non-SU(3)
-  /*--------------------------------------------------------------------*/
-
-  double llfatCUDA(int niter) {
-    host_timer_t host_timer;
-
-    comm_barrier();
-    host_timer.start();
-
-    for (int i = 0; i < niter; i++) {
-      // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
-
-      // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
-      computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param);
-
-      if (n_naiks > 1) {
-        // Create Naiks, 3rd path table set
-        computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param);
-
-        // Rescale+copy Naiks into Naik field
-        cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
-        cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size);
-      } else {
-        memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
-        memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
-      }
-
-      // Create X and long links, 2nd path table set
-      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param);
-
-      if (n_naiks > 1) {
-        // Add into Naik field
-        cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size);
-        cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size);
-      }
-    }
-
-    host_timer.stop();
-
-    return host_timer.last();
+  static void SetUpTestCase() {
+    initQuda(device_ordinal);
   }
 
-  void run_test(int niter, bool print_metrics = false) {
-    //////////////////////
-    // Perform GPU test //
-    //////////////////////
-
-    printfQuda("Tuning...\n");
-    llfatCUDA(1);
-
-    auto flops0 = quda::Tunable::flops_global();
-    auto bytes0 = quda::Tunable::bytes_global();
-
-    printfQuda("Running %d iterations of computation\n", niter);
-    double secs = llfatCUDA(niter);
-
-    unsigned long long flops = (quda::Tunable::flops_global() - flops0);
-    unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
-
-    if (print_metrics) {
-      // FIXME: does not include unitarization, extra naiks
-      int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3];
-      //long long flops = 61632 * (long long)niter; // Constructing V field
-      // Constructing W field?
-      // Constructing separate Naiks
-      //flops += 61632 * (long long)niter;     // Constructing X field
-      //flops += (252 * 4) * (long long)niter; // long-link contribution
-
-      printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter);
-
-      printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter,
-                    (flops / niter) / volume, (bytes / niter) / volume);
-
-      double gflops = 1.0e-9 * flops / secs;
-        printfQuda("GFLOPS = %f\n", gflops);
-
-      double gbytes = 1.0e-9 * bytes / secs;
-        printfQuda("GBYTES = %f\n", gbytes);
-
-      // Old metric
-      //double perf = flops / (secs * 1024 * 1024 * 1024);
-      //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf);
-    }
-  }
-
-  void verify()
+  // Per-test-case tear-down.
+  // Called after the last test in this test case.
+  // Can be omitted if not needed.
+  static void TearDownTestCase()
   {
-    ////////////////////////////////////////////////////////////////////
-    // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
-    ////////////////////////////////////////////////////////////////////
-
-    reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-
-    if (n_naiks > 1) {
-      reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-      reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    }
-
-    //////////////////////////////
-    // Perform the verification //
-    //////////////////////////////
-
-    printfQuda("Checking fat links...\n");
-    int res = 1;
-    for (int dir = 0; dir < 4; dir++) {
-      res &= compare_floats(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec);
-    }
-
-    strong_check_link(qdp_fatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
-
-    printfQuda("Fat-link test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
-
-    printfQuda("Checking long links...\n");
-    res = 1;
-    for (int dir = 0; dir < 4; ++dir) {
-      res &= compare_floats(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec);
-    }
-
-    strong_check_link(qdp_longlink, "GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
-
-    printfQuda("Long-link test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
-
-    if (n_naiks > 1) {
-
-      printfQuda("Checking fat eps_naik links...\n");
-      res = 1;
-      for (int dir = 0; dir < 4; dir++) {
-        res &= compare_floats(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, 1e-3,
-                              gauge_param.cpu_prec);
-      }
-
-      strong_check_link(qdp_fatlink_eps, "GPU results: ", fat_reflink_eps, "CPU reference results:", V,
-                        gauge_param.cpu_prec);
-
-      printfQuda("Fat-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
-
-      printfQuda("Checking long eps_naik links...\n");
-      res = 1;
-      for (int dir = 0; dir < 4; ++dir) {
-        res &= compare_floats(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, 1e-3,
-                              gauge_param.cpu_prec);
-      }
-
-      strong_check_link(qdp_longlink_eps, "GPU results: ", long_reflink_eps, "CPU reference results:", V,
-                        gauge_param.cpu_prec);
-
-      printfQuda("Long-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
-    }
+    HisqStencilTestWrapper::destroy();
+    endQuda();
   }
 };
 
-static void hisq_test()
+TEST_F(HisqStencilTest, benchmark)
 {
-  initQuda(device_ordinal);
-
-  HisqStencilTestWrapper test_wrapper;
-
-  test_wrapper.init_test();
-
-  test_wrapper.run_test(niter, true);
-
-  test_wrapper.verify();
-
-  test_wrapper.destroy();
-
-  endQuda();
+  hisq_stencil_test_wrapper.run_test(niter, /**show_metrics =*/true);
 }
 
-static void display_test_info()
+TEST_F(HisqStencilTest, verify)
 {
-  printfQuda("running the following test:\n");
+  if (!verify_results) GTEST_SKIP();
+
+  hisq_stencil_test_wrapper.run_test(2);
+
+  std::array<double, 2> res = hisq_stencil_test_wrapper.verify();
 
-  printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       Ordering\n");
-  printfQuda("%s                       %s                         %d/%d/%d/                  %d             %s \n",
-             get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
+  // extra factor of 10 b/c the norm isn't normalized
+  double max_dev = 10. * getTolerance(prec);
 
-  printfQuda("Grid partition info:     X  Y  Z  T\n");
-  printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
-             dimPartitioned(3));
+  // fat link
+  EXPECT_LE(res[0], max_dev);
 
-  printfQuda("Number of Naiks: %d\n", n_naiks);
+  // long link
+  EXPECT_LE(res[1], max_dev);
 }
 
 int main(int argc, char **argv)
 {
+  // initialize google test
+  ::testing::InitGoogleTest(&argc, argv);
+
   // for speed
   xdim = ydim = zdim = tdim = 8;
 
@@ -437,11 +77,8 @@ int main(int argc, char **argv)
   link_recon = QUDA_RECONSTRUCT_NO;
   cpu_prec = prec = QUDA_DOUBLE_PRECISION;
 
+  // Parse command line options
   auto app = make_app();
-  // app->get_formatter()->column_width(40);
-  // add_eigen_option_group(app);
-  // add_deflation_option_group(app);
-  // add_multigrid_option_group(app);
   try {
     app->parse(argc, argv);
   } catch (const CLI::ParseError &e) {
@@ -456,9 +93,15 @@ int main(int argc, char **argv)
   if (eps_naik != 0.0) { n_naiks = 2; }
 
   setVerbosity(verbosity);
-
   initComms(argc, argv, gridsize_from_cmdline);
-  display_test_info();
-  hisq_test();
+
+  // Ensure gtest prints only from rank 0
+  ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
+  if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
+
+  int test_rc = RUN_ALL_TESTS();
+
   finalizeComms();
+
+  return test_rc;
 }
diff --git a/tests/hisq_stencil_test_utils.h b/tests/hisq_stencil_test_utils.h
new file mode 100644
index 0000000000..b4f7512c12
--- /dev/null
+++ b/tests/hisq_stencil_test_utils.h
@@ -0,0 +1,402 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <quda.h>
+#include <gauge_field.h>
+#include <host_utils.h>
+#include <command_line_params.h>
+#include <misc.h>
+#include <unitarization_links.h>
+#include <ks_improved_force.h>
+
+#include <assert.h>
+#include <gtest/gtest.h>
+#include <tune_quda.h>
+
+using namespace quda;
+
+// n_naiks (declared outside this header): if eps_naik is 0.0, we only need to
+// construct one Naik. Below: gauge field order; the test drivers require MILC order.
+static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER;
+
+// The file "generic_ks/fermion_links_hisq_load_milc.c"
+// within MILC is the ultimate reference for what's going on here.
+
+// Unitarization coefficients
+static double unitarize_eps = 1e-6;
+static bool reunit_allow_svd = true;
+static bool reunit_svd_only = false;
+static double svd_rel_error = 1e-4;
+static double svd_abs_error = 1e-4;
+static double max_allowed_error = 1e-11;
+
+struct HisqStencilTestWrapper {
+
+  static inline QudaGaugeParam gauge_param;
+
+  // staple coefficients for different portions of the HISQ stencil build
+  static inline std::array<std::array<double, 6>, 3> act_paths;
+
+  // initial links in MILC order
+  static inline void* milc_sitelink = nullptr;
+
+  // storage for CPU reference fat and long links w/zero Naik
+  static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
+
+  // storage for CPU reference fat and long links w/non-zero Naik
+  static inline void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+
+  // Paths for step 1:
+  static inline void *vlink = nullptr;
+  static inline void *wlink = nullptr;
+
+  // Paths for step 2:
+  static inline void *fatlink = nullptr;
+  static inline void *longlink = nullptr;
+
+  // Place to accumulate Naiks
+  static inline void *fatlink_eps = nullptr;
+  static inline void *longlink_eps = nullptr;
+
+  static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
+
+  void init_test() {
+    cpu_prec = prec;
+    host_gauge_data_type_size = cpu_prec;
+
+    gauge_param = newQudaGaugeParam();
+    setStaggeredGaugeParam(gauge_param);
+
+    static bool first_time = true;
+    if (first_time) {
+      init_host();
+      first_time = false;
+    }
+  }
+
+  void init_host() {
+    setDims(gauge_param.X);
+    dw_setDims(gauge_param.X, 1);
+
+    ///////////////////////////////////////////////////////////////
+    // Set up the coefficients for each part of the HISQ stencil //
+    ///////////////////////////////////////////////////////////////
+
+    // Reference: "generic_ks/imp_actions/hisq/hisq_action.h",
+    // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c
+
+    double u1 = 1.0 / tadpole_factor;
+    double u2 = u1 * u1;
+    double u4 = u2 * u2;
+    double u6 = u4 * u2;
+
+    // First path: create V, W links
+    act_paths[0] = {
+      (1.0 / 8.0),                             /* one link */
+      u2 * (0.0),                              /* Naik */
+      u2 * (-1.0 / 8.0) * 0.5,                 /* simple staple */
+      u4 * (1.0 / 8.0) * 0.25 * 0.5,           /* displace link in two directions */
+      u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */
+      u4 * (0.0)                               /* Lepage term */
+    };
+
+    // Second path: create X, long links
+    act_paths[1] = {
+      ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */
+                                                        /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
+      (-1.0 / 24.0),                                    /* Naik */
+      (-1.0 / 8.0) * 0.5,                               /* simple staple */
+      (1.0 / 8.0) * 0.25 * 0.5,                         /* displace link in two directions */
+      (-1.0 / 8.0) * 0.125 * (1.0 / 6.0),               /* displace link in three directions */
+      (-2.0 / 16.0)                                     /* Lepage term, correct O(a^2) 2x ASQTAD */
+    };
+
+    // Paths for epsilon corrections. Not used if n_naiks = 1.
+    act_paths[2] = {
+      (1.0 / 8.0),   /* one link b/c of Naik */
+      (-1.0 / 24.0), /* Naik */
+      0.0,           /* simple staple */
+      0.0,           /* displace link in two directions */
+      0.0,           /* displace link in three directions */
+      0.0            /* Lepage term */
+    };
+
+    ////////////////////////////////////
+    // Set unitarization coefficients //
+    ////////////////////////////////////
+
+    setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
+                              svd_abs_error);
+
+    /////////////////
+    // Input links //
+    /////////////////
+
+    void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr};
+    for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
+
+    milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+
+    // Note: this could be replaced with loading a gauge field
+    createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases
+    reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+
+    ///////////////////////
+    // Perform CPU Build //
+    ///////////////////////
+
+    for (int i = 0; i < 4; i++) {
+      // fat and long links for fermions with zero epsilon
+      fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+
+      // fat and long links for fermions with non-zero epsilon
+      if (n_naiks > 1) {
+        fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+        long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      }
+    }
+
+    computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param,
+                        act_paths, eps_naik);
+
+    ///////////////////////////////////////////////////////
+    // Allocate host storage for fields built on the GPU //
+    ///////////////////////////////////////////////////////
+
+    // Paths for step 1:
+    vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links
+    wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links
+
+    // Paths for step 2:
+    fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // final fat ("X") links
+    longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links
+
+    // Place to accumulate Naiks
+    if (n_naiks > 1) {
+      fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // epsilon fat links
+      longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks
+    }
+
+    // QDP order fields
+    for (int i = 0; i < 4; i++) {
+      qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      if (n_naiks > 1) {
+        qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+        qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      }
+    }
+
+    /////////////////////////////////////////////////////////
+    // Free allocations that are only needed for CPU setup //
+    /////////////////////////////////////////////////////////
+
+    for (int i = 0; i < 4; i++)
+      host_free(qdp_sitelink[i]);
+
+#ifdef MULTI_GPU
+    exchange_llfat_cleanup();
+#endif
+  }
+
+  static void end() {
+    freeGaugeQuda();
+  }
+
+  static void destroy() {
+    if (milc_sitelink) host_free(milc_sitelink);
+
+    for (int i = 0; i < 4; i++) {
+      host_free(fat_reflink[i]);
+      host_free(long_reflink[i]);
+      if (n_naiks > 1) {
+        host_free(fat_reflink_eps[i]);
+        host_free(long_reflink_eps[i]);
+      }
+    }
+
+    // Clean up GPU compute links
+    host_free(vlink);
+    host_free(wlink);
+    host_free(fatlink);
+    host_free(longlink);
+
+    if (n_naiks > 1) {
+      host_free(fatlink_eps);
+      host_free(longlink_eps);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      host_free(qdp_fatlink[i]);
+      host_free(qdp_longlink[i]);
+      if (n_naiks > 1) {
+        host_free(qdp_fatlink_eps[i]);
+        host_free(qdp_longlink_eps[i]);
+      }
+    }
+  }
+
+  /*--------------------------------------------------------------------*/
+  // Some notation:
+  // U -- original link, SU(3), copied to "field" from "site"
+  // V -- after 1st level of smearing, non-SU(3)
+  // W -- unitarized, SU(3)
+  // X -- after 2nd level of smearing, non-SU(3)
+  /*--------------------------------------------------------------------*/
+
+  double llfatCUDA(int niter) {
+    host_timer_t host_timer;
+
+    comm_barrier();
+    host_timer.start();
+
+    for (int i = 0; i < niter; i++) {
+      // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
+
+      // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
+      computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param);
+
+      if (n_naiks > 1) {
+        // Create Naiks, 3rd path table set
+        computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param);
+
+        // Rescale+copy Naiks into Naik field
+        cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
+        cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size);
+      } else {
+        memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
+        memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
+      }
+
+      // Create X and long links, 2nd path table set
+      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param);
+
+      if (n_naiks > 1) {
+        // Add into Naik field
+        cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size);
+        cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size);
+      }
+    }
+
+    host_timer.stop();
+
+    return host_timer.last();
+  }
+
+  void run_test(int niter, bool print_metrics = false) {
+    //////////////////////
+    // Perform GPU test //
+    //////////////////////
+
+    printfQuda("Tuning...\n");
+    llfatCUDA(1);
+
+    auto flops0 = quda::Tunable::flops_global();
+    auto bytes0 = quda::Tunable::bytes_global();
+
+    printfQuda("Running %d iterations of computation\n", niter);
+    double secs = llfatCUDA(niter);
+
+    unsigned long long flops = (quda::Tunable::flops_global() - flops0);
+    unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
+
+    if (print_metrics) {
+      // FIXME: does not include unitarization, extra naiks
+      int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3];
+      //long long flops = 61632 * (long long)niter; // Constructing V field
+      // Constructing W field?
+      // Constructing separate Naiks
+      //flops += 61632 * (long long)niter;     // Constructing X field
+      //flops += (252 * 4) * (long long)niter; // long-link contribution
+
+      printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter);
+
+      printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter,
+                    (flops / niter) / volume, (bytes / niter) / volume);
+
+      double gflops = 1.0e-9 * flops / secs;
+        printfQuda("GFLOPS = %f\n", gflops);
+
+      double gbytes = 1.0e-9 * bytes / secs;
+        printfQuda("GBYTES = %f\n", gbytes);
+
+      // Old metric
+      //double perf = flops / (secs * 1024 * 1024 * 1024);
+      //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf);
+    }
+  }
+
+  std::array<double, 2> verify()
+  {
+    ////////////////////////////////////////////////////////////////////
+    // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
+    ////////////////////////////////////////////////////////////////////
+
+    reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+
+    if (n_naiks > 1) {
+      reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+      reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    }
+
+    //////////////////////////////
+    // Perform the verification //
+    //////////////////////////////
+
+    std::array<double, 2> res = {0., 0.};
+
+    // extra factor of 10 b/c the norm isn't normalized
+    double max_dev = 10. * getTolerance(prec);
+
+    // Non-zero epsilon check
+    if (n_naiks > 1) {
+      for (int dir = 0; dir < 4; dir++) {
+        res[0] = std::max(res[0],
+          compare_floats_v2(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, max_dev,
+                            gauge_param.cpu_prec));
+      }
+
+      strong_check_link(qdp_fatlink_eps, "Fat link GPU results: ", fat_reflink_eps, "CPU reference results:", V,
+                        gauge_param.cpu_prec);
+
+      for (int dir = 0; dir < 4; ++dir) {
+        res[1] = std::max(res[1],
+          compare_floats_v2(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, max_dev,
+                            gauge_param.cpu_prec));
+      }
+
+      strong_check_link(qdp_longlink_eps, "Long link GPU results: ", long_reflink_eps, "CPU reference results:", V,
+                        gauge_param.cpu_prec);
+    } else {
+      for (int dir = 0; dir < 4; dir++) {
+        res[0] = std::max(res[0],
+          compare_floats_v2(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec));
+      }
+
+      strong_check_link(qdp_fatlink, "Fat link GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
+
+      for (int dir = 0; dir < 4; ++dir) {
+        res[1] = std::max(res[1],
+          compare_floats_v2(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec));
+      }
+
+      strong_check_link(qdp_longlink, "Long link GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
+    }
+
+    printfQuda("Fat link test %s\n", (res[0] < max_dev) ? "PASSED" : "FAILED");
+    printfQuda("Long link test %s\n", (res[1] < max_dev) ? "PASSED" : "FAILED");
+
+    return res;
+
+  }
+};

From eff6773549dbe8f96929edd4e2426883eb7cc54f Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 30 Nov 2023 12:09:34 -0800
Subject: [PATCH 12/53] Created a working hisq_stencil_ctest, woohoo!

---
 tests/CMakeLists.txt            |  14 ++-
 tests/hisq_stencil_ctest.cpp    | 179 ++++++++++++++++++++++++++++++++
 tests/hisq_stencil_test.cpp     |   7 +-
 tests/hisq_stencil_test_utils.h | 151 +++++++++++++++++----------
 4 files changed, 295 insertions(+), 56 deletions(-)
 create mode 100644 tests/hisq_stencil_ctest.cpp

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3a95e355e8..135bbd90be 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -230,6 +230,11 @@ if(QUDA_DIRAC_STAGGERED)
   quda_checkbuildtest(hisq_stencil_test QUDA_BUILD_ALL_TESTS)
   install(TARGETS hisq_stencil_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+  add_executable(hisq_stencil_ctest hisq_stencil_ctest.cpp)
+  target_link_libraries(hisq_stencil_ctest ${TEST_LIBS})
+  quda_checkbuildtest(hisq_stencil_ctest QUDA_BUILD_ALL_TESTS)
+  install(TARGETS hisq_stencil_ctest ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
+
   add_executable(hisq_paths_force_test hisq_paths_force_test.cpp)
   target_link_libraries(hisq_paths_force_test ${TEST_LIBS})
   quda_checkbuildtest(hisq_paths_force_test QUDA_BUILD_ALL_TESTS)
@@ -1289,7 +1294,14 @@ foreach(prec IN LISTS TEST_PRECS)
       --gtest_output=xml:eigensolve_test_mobius_eofa_asym_${prec}.xml)
   endif()
 endforeach(prec)
-  
+
+if(QUDA_DIRAC_STAGGERED)
+  add_test(NAME hisq_stencil
+           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_stencil_ctest> ${MPIEXEC_POSTFLAGS}
+                   --dim 8 8 8 8
+                   --gtest_output=xml:hisq_stencil_test.xml)
+endif()
+
 foreach(prec IN LISTS TEST_PRECS)
 
   add_test(NAME gauge_path_${prec}
diff --git a/tests/hisq_stencil_ctest.cpp b/tests/hisq_stencil_ctest.cpp
new file mode 100644
index 0000000000..55186c6a5f
--- /dev/null
+++ b/tests/hisq_stencil_ctest.cpp
@@ -0,0 +1,179 @@
+#include "hisq_stencil_test_utils.h"
+
+using namespace quda;
+
+bool ctest_all_partitions = false;
+
+using ::testing::Bool;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::TestWithParam;
+using ::testing::Values;
+
+class HisqStencilTest : public ::testing::TestWithParam<::testing::tuple<QudaPrecision, QudaReconstructType, bool, int>>
+{
+protected:
+  ::testing::tuple<QudaPrecision, QudaReconstructType, bool, int> param;
+
+  HisqStencilTestWrapper hisq_stencil_test_wrapper;
+
+  bool skip()
+  {
+    QudaPrecision precision = static_cast<QudaPrecision>(::testing::get<0>(GetParam()));
+    QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
+
+    if ((QUDA_PRECISION & precision) == 0
+        || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0)
+      return true;
+
+    const std::array<bool, 16> partition_enabled {true, true, true,  false,  true,  false, false, false,
+                                                  true, false, false, false, true, false, true, true};
+    if (!ctest_all_partitions && !partition_enabled[::testing::get<3>(GetParam())]) return true;
+
+    return false;
+  }
+
+  void display_test_info(QudaPrecision prec, QudaReconstructType link_recon, bool has_naik) {
+    printfQuda("running the following test:\n");
+    printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       Ordering\n");
+    printfQuda("%s                       %s                         %d/%d/%d/                  %d             %s \n",
+              get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
+    printfQuda("Grid partition info:     X  Y  Z  T\n");
+    printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
+              dimPartitioned(3));
+    printfQuda("Number of Naiks: %d\n", has_naik ? 2 : 1);
+  }
+
+public:
+  virtual void SetUp() {
+    QudaPrecision prec = static_cast<QudaPrecision>(::testing::get<0>(GetParam()));
+    QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
+    bool has_naik = ::testing::get<2>(GetParam());
+
+    if (skip()) GTEST_SKIP();
+
+    int partition = ::testing::get<3>(GetParam());
+    for (int j = 0; j < 4; j++) {
+      if (partition & (1 << j)) { commDimPartitionedSet(j); }
+    }
+    updateR();
+
+    hisq_stencil_test_wrapper.init_ctest(prec, recon, has_naik);
+    display_test_info(prec, recon, has_naik);
+  }
+
+  virtual void TearDown() {
+    if (skip()) GTEST_SKIP();
+    hisq_stencil_test_wrapper.end();
+  }
+
+  static void SetUpTestCase() {
+    initQuda(device_ordinal);
+  }
+
+  // Per-test-case tear-down.
+  // Called after the last test in this test case.
+  // Can be omitted if not needed.
+  static void TearDownTestCase()
+  {
+    HisqStencilTestWrapper::destroy();
+    endQuda();
+  }
+};
+
+TEST_P(HisqStencilTest, benchmark)
+{
+  hisq_stencil_test_wrapper.run_test(niter, /*print_metrics=*/true);
+}
+
+TEST_P(HisqStencilTest, verify)
+{
+  hisq_stencil_test_wrapper.run_test(2);
+
+  std::array<double, 2> res = hisq_stencil_test_wrapper.verify();
+
+  // extra factor of 10 b/c the norm isn't normalized
+  double max_dev = 10. * getTolerance(prec);
+
+  // fat link
+  EXPECT_LE(res[0], max_dev) << "Reference CPU and QUDA implementations of fat link do not agree";
+
+  // long link
+  EXPECT_LE(res[1], max_dev) << "Reference CPU and QUDA implementations of long link do not agree";
+}
+
+int main(int argc, char **argv)
+{
+  // initialize google test
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // for speed
+  xdim = ydim = zdim = tdim = 8;
+
+  // default to 18 reconstruct
+  link_recon = QUDA_RECONSTRUCT_NO;
+  cpu_prec = prec = QUDA_DOUBLE_PRECISION;
+
+  // Parse command line options
+  auto app = make_app();
+  app->add_option("--all-partitions", ctest_all_partitions, "Test all instead of reduced combination of partitions");
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
+  }
+
+  if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION)
+    errorQuda("Precision %d is unsupported in some link fattening routines\n", prec);
+
+  if (link_recon != QUDA_RECONSTRUCT_NO)
+    errorQuda("Reconstruct %d is unsupported in some link fattening routines\n", link_recon);
+
+  if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order);
+
+  if (eps_naik != 0.0) { n_naiks = 2; }
+
+  setVerbosity(verbosity);
+  initComms(argc, argv, gridsize_from_cmdline);
+
+  // Ensure gtest prints only from rank 0
+  ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
+  if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
+
+  int test_rc = RUN_ALL_TESTS();
+
+  finalizeComms();
+
+  return test_rc;
+}
+
+std::string gethisqstenciltestname(testing::TestParamInfo<::testing::tuple<QudaPrecision, QudaReconstructType, bool, int>> param)
+{
+  const QudaPrecision prec = static_cast<QudaPrecision>(::testing::get<0>(param.param));
+  const QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(param.param));
+  const bool has_naik = ::testing::get<2>(param.param);
+  const int part = ::testing::get<3>(param.param);
+  std::stringstream ss;
+  // ss << get_dslash_str(dslash_type) << "_";
+  ss << get_prec_str(prec);
+  ss << "_r" << recon;
+  if (has_naik) ss << "_naik";
+  ss << "_partition" << part;
+  return ss.str();
+}
+
+#ifdef MULTI_GPU
+INSTANTIATE_TEST_SUITE_P(QUDA, HisqStencilTest,
+                         Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION),
+                                 ::testing::Values(QUDA_RECONSTRUCT_NO),
+                                 ::testing::Bool(),
+                                 Range(0, 16)),
+                         gethisqstenciltestname);
+#else
+INSTANTIATE_TEST_SUITE_P(QUDA, HisqStencilTest,
+                         Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION),
+                                 ::testing::Values(QUDA_RECONSTRUCT_NO),
+                                 ::testing::Bool(),
+                                 ::testing::Values(0)),
+                         gethisqstenciltestname);
+#endif
diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp
index 138bba9eea..e42dc7b804 100644
--- a/tests/hisq_stencil_test.cpp
+++ b/tests/hisq_stencil_test.cpp
@@ -59,10 +59,10 @@ TEST_F(HisqStencilTest, verify)
   double max_dev = 10. * getTolerance(prec);
 
   // fat link
-  EXPECT_LE(res[0], max_dev);
+  EXPECT_LE(res[0], max_dev) << "Reference CPU and QUDA implementations of fat link do not agree";
 
   // long link
-  EXPECT_LE(res[1], max_dev);
+  EXPECT_LE(res[1], max_dev) << "Reference CPU and QUDA implementations of long link do not agree";
 }
 
 int main(int argc, char **argv)
@@ -88,6 +88,9 @@ int main(int argc, char **argv)
   if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION)
     errorQuda("Precision %d is unsupported in some link fattening routines\n", prec);
 
+  if (link_recon != QUDA_RECONSTRUCT_NO)
+    errorQuda("Reconstruct %d is unsupported in some link fattening routines\n", link_recon);
+
   if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order);
 
   if (eps_naik != 0.0) { n_naiks = 2; }
diff --git a/tests/hisq_stencil_test_utils.h b/tests/hisq_stencil_test_utils.h
index b4f7512c12..4c1f3cfcc3 100644
--- a/tests/hisq_stencil_test_utils.h
+++ b/tests/hisq_stencil_test_utils.h
@@ -63,23 +63,54 @@ struct HisqStencilTestWrapper {
   static inline void *fatlink_eps = nullptr;
   static inline void *longlink_eps = nullptr;
 
+  static inline void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr};
   static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
   static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
   static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
   static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
 
-  void init_test() {
-    cpu_prec = prec;
-    host_gauge_data_type_size = cpu_prec;
+  void set_naik(bool has_naik) {
+    if (has_naik) {
+      eps_naik = -0.03; // semi-arbitrary
+      n_naiks = 2;
+    } else {
+      eps_naik = 0.0;
+      n_naiks = 1;
+    }
+  }
+
+  void init_ctest(QudaPrecision prec_, QudaReconstructType link_recon_, bool has_naik) {
+    prec = prec_;
+    link_recon = link_recon_;
+
+    set_naik(has_naik);
 
     gauge_param = newQudaGaugeParam();
     setStaggeredGaugeParam(gauge_param);
 
+    gauge_param.cuda_prec = prec;
+
     static bool first_time = true;
     if (first_time) {
+      // force the Naik build up front; it doesn't affect the non-Naik fields
+      set_naik(true);
       init_host();
+      set_naik(has_naik);
       first_time = false;
     }
+    init();
+  }
+
+  void init_test() {
+    gauge_param = newQudaGaugeParam();
+    setStaggeredGaugeParam(gauge_param);
+
+    static bool first_time = true;
+    if (first_time) {
+      init_host();
+      first_time = false;
+    }
+    init();
   }
 
   void init_host() {
@@ -140,14 +171,10 @@ struct HisqStencilTestWrapper {
     // Input links //
     /////////////////
 
-    void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr};
     for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
 
-    milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-
     // Note: this could be replaced with loading a gauge field
     createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases
-    reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
 
     ///////////////////////
     // Perform CPU Build //
@@ -168,23 +195,9 @@ struct HisqStencilTestWrapper {
     computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param,
                         act_paths, eps_naik);
 
-    ///////////////////////////////////////////////////////
-    // Allocate host storage for fields built on the GPU //
-    ///////////////////////////////////////////////////////
-
-    // Paths for step 1:
-    vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links
-    wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links
-
-    // Paths for step 2:
-    fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // final fat ("X") links
-    longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links
-
-    // Place to accumulate Naiks
-    if (n_naiks > 1) {
-      fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);  // epsilon fat links
-      longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks
-    }
+    /////////////////////////////////////////////////////////////////////
+    // Allocate CPU-precision host storage for fields built on the GPU //
+    /////////////////////////////////////////////////////////////////////
 
     // QDP order fields
     for (int i = 0; i < 4; i++) {
@@ -196,24 +209,60 @@ struct HisqStencilTestWrapper {
       }
     }
 
-    /////////////////////////////////////////////////////////
-    // Free allocations that are only needed for CPU setup //
-    /////////////////////////////////////////////////////////
-
-    for (int i = 0; i < 4; i++)
-      host_free(qdp_sitelink[i]);
-
 #ifdef MULTI_GPU
     exchange_llfat_cleanup();
 #endif
   }
 
+  void init() {
+
+    // reset the reconstruct in gauge param
+    gauge_param.reconstruct = link_recon;
+
+    /////////////////////////////////////////////////////////////////
+    // Create a CPU copy of the initial field in the GPU precision //
+    /////////////////////////////////////////////////////////////////
+
+    milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec);
+    reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cuda_prec, gauge_param.cpu_prec);
+
+    ///////////////////////////////////////////////////////
+    // Allocate host storage for fields built on the GPU //
+    ///////////////////////////////////////////////////////
+
+    // Paths for step 1:
+    vlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // V links
+    wlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // W links
+
+    // Paths for step 2:
+    fatlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec);  // final fat ("X") links
+    longlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // final long links
+
+    // Place to accumulate Naiks
+    if (n_naiks > 1) {
+      fatlink_eps = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec);  // epsilon fat links
+      longlink_eps = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // epsilon long naiks
+    }
+  }
+
   static void end() {
+    if (milc_sitelink) host_free(milc_sitelink);
+
+    // Clean up GPU compute links
+    if (vlink) host_free(vlink);
+    if (wlink) host_free(wlink);
+    if (fatlink) host_free(fatlink);
+    if (longlink) host_free(longlink);
+
+    if (n_naiks > 1) {
+      if (fatlink_eps) host_free(fatlink_eps);
+      if (longlink_eps) host_free(longlink_eps);
+    }
+
     freeGaugeQuda();
   }
 
   static void destroy() {
-    if (milc_sitelink) host_free(milc_sitelink);
 
     for (int i = 0; i < 4; i++) {
       host_free(fat_reflink[i]);
@@ -224,18 +273,8 @@ struct HisqStencilTestWrapper {
       }
     }
 
-    // Clean up GPU compute links
-    host_free(vlink);
-    host_free(wlink);
-    host_free(fatlink);
-    host_free(longlink);
-
-    if (n_naiks > 1) {
-      host_free(fatlink_eps);
-      host_free(longlink_eps);
-    }
-
     for (int i = 0; i < 4; i++) {
+      host_free(qdp_sitelink[i]);
       host_free(qdp_fatlink[i]);
       host_free(qdp_longlink[i]);
       if (n_naiks > 1) {
@@ -259,6 +298,10 @@ struct HisqStencilTestWrapper {
     comm_barrier();
     host_timer.start();
 
+    // manually override precision of input fields
+    auto cpu_param_backup = gauge_param.cpu_prec;
+    gauge_param.cpu_prec = gauge_param.cuda_prec;
+
     for (int i = 0; i < niter; i++) {
       // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
 
@@ -270,11 +313,11 @@ struct HisqStencilTestWrapper {
         computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param);
 
         // Rescale+copy Naiks into Naik field
-        cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
-        cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size);
+        cpu_axy(gauge_param.cuda_prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
+        cpu_axy(gauge_param.cuda_prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size);
       } else {
-        memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
-        memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
+        memset(fatlink, 0, V * 4 * gauge_site_size * gauge_param.cuda_prec);
+        memset(longlink, 0, V * 4 * gauge_site_size * gauge_param.cuda_prec);
       }
 
       // Create X and long links, 2nd path table set
@@ -282,11 +325,13 @@ struct HisqStencilTestWrapper {
 
       if (n_naiks > 1) {
         // Add into Naik field
-        cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size);
-        cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size);
+        cpu_xpy(gauge_param.cuda_prec, fatlink, fatlink_eps, V * 4 * gauge_site_size);
+        cpu_xpy(gauge_param.cuda_prec, longlink, longlink_eps, V * 4 * gauge_site_size);
       }
     }
 
+    gauge_param.cpu_prec = cpu_param_backup;
+
     host_timer.stop();
 
     return host_timer.last();
@@ -341,12 +386,12 @@ struct HisqStencilTestWrapper {
     // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
     ////////////////////////////////////////////////////////////////////
 
-    reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+    reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec);
+    reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec);
 
     if (n_naiks > 1) {
-      reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-      reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+      reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec);
+      reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec);
     }
 
     //////////////////////////////

From cb940215fe4d83972b49fc0d47ffce5846ab377a Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 30 Nov 2023 15:03:30 -0800
Subject: [PATCH 13/53] Some cleanup of staggered_invert_test, working towards
 a ctest

---
 tests/staggered_invert_test.cpp | 85 ++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 27 deletions(-)

diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 67e96507c1..c039482c94 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -18,6 +18,14 @@
 #include <staggered_gauge_utils.h>
 #include <llfat_utils.h>
 
+QudaGaugeParam gauge_param;
+QudaInvertParam inv_param;
+QudaMultigridParam mg_param;
+QudaInvertParam mg_inv_param;
+QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL];
+QudaEigParam eig_param;
+bool use_split_grid = false;
+
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 void display_test_info()
@@ -102,33 +110,28 @@ void display_test_info()
              dimPartitioned(3));
 }
 
-void test(int, char**)
-{
+GaugeField cpuFatQDP = {};
+GaugeField cpuLongQDP = {};
+GaugeField cpuFatMILC = {};
+GaugeField cpuLongMILC = {};
 
+void init()
+{
   // Set QUDA internal parameters
-  QudaGaugeParam gauge_param = newQudaGaugeParam();
-  QudaInvertParam inv_param = newQudaInvertParam();
+  gauge_param = newQudaGaugeParam();
   setStaggeredGaugeParam(gauge_param);
-  if (!inv_multigrid) setStaggeredInvertParam(inv_param);
-
-  QudaInvertParam mg_inv_param = newQudaInvertParam();
-  QudaMultigridParam mg_param = newQudaMultigridParam();
-  QudaEigParam mg_eig_param[mg_levels];
 
-  // params related to split grid.
-  for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
-  int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
-  bool use_split_grid = num_sub_partition > 1;
+  inv_param = newQudaInvertParam();
+  mg_inv_param = newQudaInvertParam();
+  mg_param = newQudaMultigridParam();
+  eig_param = newQudaEigParam();
 
   if (inv_multigrid) {
-
     // Set some default values for MG solve types
     setQudaMgSolveTypes();
-
     setStaggeredMGInvertParam(inv_param);
     // Set sub structures
     mg_param.invert_param = &mg_inv_param;
-
     for (int i = 0; i < mg_levels; i++) {
       if (mg_eig[i]) {
         mg_eig_param[i] = newQudaEigParam();
@@ -138,10 +141,12 @@ void test(int, char**)
         mg_param.eig_param[i] = nullptr;
       }
     }
+    // Set MG
     setStaggeredMultigridParam(mg_param);
+  } else {
+    setStaggeredInvertParam(inv_param);
   }
 
-  QudaEigParam eig_param = newQudaEigParam();
   if (inv_deflate) {
     setEigParam(eig_param);
     inv_param.eig_param = &eig_param;
@@ -168,16 +173,16 @@ void test(int, char**)
   cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuParam.order = QUDA_QDP_GAUGE_ORDER;
   GaugeField cpuIn = GaugeField(cpuParam);
-  GaugeField cpuFatQDP = GaugeField(cpuParam);
+  cpuFatQDP = GaugeField(cpuParam);
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
-  GaugeField cpuFatMILC = GaugeField(cpuParam);
+  cpuFatMILC = GaugeField(cpuParam);
 
   cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS;
   cpuParam.nFace = 3;
   cpuParam.order = QUDA_QDP_GAUGE_ORDER;
-  GaugeField cpuLongQDP = GaugeField(cpuParam);
+  cpuLongQDP = GaugeField(cpuParam);
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
-  GaugeField cpuLongMILC = GaugeField(cpuParam);
+  cpuLongMILC = GaugeField(cpuParam);
 
   void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
   void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
@@ -212,6 +217,14 @@ void test(int, char**)
 
   // Staggered Gauge construct END
   //-----------------------------------------------------------------------------------
+}
+
+std::vector<double> solve()
+{
+  // params related to split grid.
+  for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
+  int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
+  use_split_grid = num_sub_partition > 1; // assign the file-scope flag; a local `bool` here would shadow it
 
   // Setup the multigrid preconditioner
   void *mg_preconditioner = nullptr;
@@ -252,6 +265,8 @@ void test(int, char**)
   // QUDA invert test
   //----------------------------------------------------------------------------
 
+  std::vector<double> res(Nsrc);
+
   if (multishift == 1) {
     if (!use_split_grid) {
       for (int k = 0; k < Nsrc; k++) {
@@ -285,7 +300,7 @@ void test(int, char**)
 
     for (int k = 0; k < Nsrc; k++) {
       if (verify_results)
-        verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, inv_param, 0);
+        res[k] = verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, inv_param, 0);
     }
   } else if (multishift > 1) {
     if (use_split_grid)
@@ -332,20 +347,32 @@ void test(int, char**)
       printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs,
                  inv_param.gflops / inv_param.secs);
 
+
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i);
+        auto resid = verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i);
+        if (i == 0) res[k] = resid;
       }
     }
   } else {
     errorQuda("Invalid number of shifts %d", multishift);
-  } // switch
+  }
+
+  // Free the multigrid solver
+  if (inv_multigrid) destroyMultigridQuda(mg_preconditioner);
 
   // Compute timings
   if (Nsrc > 1 && !use_split_grid) performanceStats(time, gflops, iter);
 
-  // Free the multigrid solver
-  if (inv_multigrid) destroyMultigridQuda(mg_preconditioner);
+  return res;
+}
+
+void cleanup()
+{
+  cpuFatQDP = {};
+  cpuLongQDP = {};
+  cpuFatMILC = {};
+  cpuLongMILC = {};
 }
 
 int main(int argc, char **argv)
@@ -392,7 +419,11 @@ int main(int argc, char **argv)
 
   initQuda(device_ordinal);
 
-  test(argc, argv);
+  init();
+
+  solve();
+
+  cleanup();
 
   // Finalize the QUDA library
   endQuda();

From b3508beac5ae2be9c58d48fcc3879321faebf6f5 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 30 Nov 2023 20:12:49 -0800
Subject: [PATCH 14/53] Added a mostly working gtest!

---
 tests/host_reference/dslash_reference.cpp |   4 +-
 tests/host_reference/dslash_reference.h   |   2 +-
 tests/staggered_invert_test.cpp           |  67 +++++-
 tests/staggered_invert_test_gtest.hpp     | 242 ++++++++++++++++++++++
 4 files changed, 303 insertions(+), 12 deletions(-)
 create mode 100644 tests/staggered_invert_test_gtest.hpp

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index a5685ed8fe..c2db9993f8 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -743,7 +743,7 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
   return l2r;
 }
 
-double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
                                 quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link,
                                 QudaInvertParam &inv_param, int shift)
 {
@@ -798,5 +798,5 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
     }
   }
 
-  return l2r;
+  return {l2r, inv_param.tol_hq};
 }
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 8291dc688d..b17238bac4 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -109,7 +109,7 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 void *spinorCheck, QudaGaugeParam &gauge_param,
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
-double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
                                 quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link,
                                 QudaInvertParam &inv_param, int shift);
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index c039482c94..dba2813916 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -26,7 +26,8 @@ QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL];
 QudaEigParam eig_param;
 bool use_split_grid = false;
 
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
+// if --enable-testing true is passed, we run the tests defined in here
+#include <staggered_invert_test_gtest.hpp>
 
 void display_test_info()
 {
@@ -219,8 +220,30 @@ void init()
   //-----------------------------------------------------------------------------------
 }
 
-std::vector<double> solve()
+std::vector<std::array<double, 2>> solve(test_t param)
 {
+  inv_param.inv_type = ::testing::get<0>(param);
+  inv_param.solution_type = ::testing::get<1>(param);
+  inv_param.solve_type = ::testing::get<2>(param);
+  inv_param.cuda_prec_sloppy = ::testing::get<3>(param);
+  multishift = ::testing::get<4>(param);
+  inv_param.solution_accumulator_pipeline = ::testing::get<5>(param);
+
+  // schwarz parameters
+  auto schwarz_param = ::testing::get<6>(param);
+  inv_param.schwarz_type           = ::testing::get<0>(schwarz_param);
+  inv_param.inv_type_precondition  = ::testing::get<1>(schwarz_param);
+  inv_param.cuda_prec_precondition = ::testing::get<2>(schwarz_param);
+
+  inv_param.residual_type = ::testing::get<7>(param);
+
+  // reset lambda_max if we're doing a testing loop to ensure correct lambda_max
+  if (enable_testing) inv_param.ca_lambda_max = -1.0;
+
+  logQuda(QUDA_SUMMARIZE, "Solution = %s, Solve = %s, Solver = %s, Sloppy precision = %s\n",
+          get_solution_str(inv_param.solution_type), get_solve_str(inv_param.solve_type),
+          get_solver_str(inv_param.inv_type), get_prec_str(inv_param.cuda_prec_sloppy));
+
   // params related to split grid.
   for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
   int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
@@ -265,7 +288,7 @@ std::vector<double> solve()
   // QUDA invert test
   //----------------------------------------------------------------------------
 
-  std::vector<double> res(Nsrc);
+  std::vector<std::array<double, 2>> res(Nsrc);
 
   if (multishift == 1) {
     if (!use_split_grid) {
@@ -351,7 +374,13 @@ std::vector<double> solve()
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
         auto resid = verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i);
-        if (i == 0) res[k] = resid;
+
+        // take the HQ residual from the lightest mass
+        if (i == 0) {
+          res[k] = resid;
+        } else {
+          if (resid[0] > res[k][0]) res[k][0] = resid[0];
+        }
       }
     }
   } else {
@@ -377,6 +406,7 @@ void cleanup()
 
 int main(int argc, char **argv)
 {
+  ::testing::InitGoogleTest(&argc, argv);
   setQudaStaggeredDefaultInvTestParams();
   setQudaDefaultMgTestParams();
   // Parse command line options
@@ -385,7 +415,7 @@ int main(int argc, char **argv)
   add_deflation_option_group(app);
   add_multigrid_option_group(app);
   add_comms_option_group(app);
-
+  add_testing_option_group(app);
   try {
     app->parse(argc, argv);
   } catch (const CLI::ParseError &e) {
@@ -419,17 +449,36 @@ int main(int argc, char **argv)
 
   initQuda(device_ordinal);
 
+  // need force a well-behaved operator + reasonable convergence
+  if (enable_testing) {
+    compute_fatlong = true;
+    mass = 0.32; // yes, it's a magic number
+    tol = 1e-6;
+    tol_hq = 1e-6;
+    //niter = 500; // the staggered spectrum is rough
+  }
+
   init();
 
-  solve();
+  int result = 0;
+  if (enable_testing) { // tests are defined in staggered_invert_test_gtest.hpp
+    ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
+    if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      errorQuda("Staggered ctest doesn't support the Laplace operator (yet)");
+    result = RUN_ALL_TESTS();
+  } else {
+    solve(test_t {inv_type, solution_type, solve_type, prec_sloppy, multishift, solution_accumulator_pipeline,
+                  schwarz_t {precon_schwarz_type, inv_multigrid ? QUDA_MG_INVERTER : precon_type, prec_precondition},
+                  inv_param.residual_type});
+  }
 
   cleanup();
 
   // Finalize the QUDA library
+  freeGaugeQuda();
   endQuda();
-
-  // Finalize the communications layer
   finalizeComms();
 
-  return 0;
+  return result;
 }
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
new file mode 100644
index 0000000000..4eee2b37ee
--- /dev/null
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -0,0 +1,242 @@
+#include <gtest/gtest.h>
+#include <quda_arch.h>
+
+// tuple containing parameters for Schwarz solver
+using schwarz_t = ::testing::tuple<QudaSchwarzType, QudaInverterType, QudaPrecision>;
+
+using test_t
+  = ::testing::tuple<QudaInverterType, QudaSolutionType, QudaSolveType, QudaPrecision, int, int, schwarz_t, QudaResidualType>;
+
+class StaggeredInvertTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  test_t param;
+
+public:
+  StaggeredInvertTest() : param(GetParam()) { }
+};
+
+bool is_normal_residual(QudaInverterType type)
+{
+  switch (type) {
+  case QUDA_CGNR_INVERTER:
+  case QUDA_CA_CGNR_INVERTER: return true;
+  default: return false;
+  }
+}
+
+bool is_preconditioned_solve(QudaSolveType type)
+{
+  switch (type) {
+  case QUDA_DIRECT_PC_SOLVE:
+  case QUDA_NORMOP_PC_SOLVE: return true;
+  default: return false;
+  }
+}
+
+bool is_full_solution(QudaSolutionType type)
+{
+  switch (type) {
+  case QUDA_MAT_SOLUTION:
+  case QUDA_MATDAG_MAT_SOLUTION: return true;
+  default: return false;
+  }
+}
+
+bool is_normal_solve(test_t param)
+{
+  auto inv_type = ::testing::get<0>(param);
+  auto solve_type = ::testing::get<2>(param);
+
+  switch (solve_type) {
+  case QUDA_NORMOP_SOLVE:
+  case QUDA_NORMOP_PC_SOLVE: return true;
+  default:
+    switch (inv_type) {
+    case QUDA_CGNR_INVERTER:
+    case QUDA_CGNE_INVERTER:
+    case QUDA_CA_CGNR_INVERTER:
+    case QUDA_CA_CGNE_INVERTER: return true;
+    default: return false;
+    }
+  }
+}
+
+bool support_solution_accumulator_pipeline(QudaInverterType type)
+{
+  switch (type) {
+  case QUDA_CG_INVERTER:
+  case QUDA_CA_CG_INVERTER:
+  case QUDA_CGNR_INVERTER:
+  case QUDA_CGNE_INVERTER:
+  case QUDA_PCG_INVERTER: return true;
+  default: return false;
+  }
+}
+
+bool skip_test(test_t param)
+{
+  auto inverter_type = ::testing::get<0>(param);
+  auto solution_type = ::testing::get<1>(param);
+  auto prec_sloppy = ::testing::get<3>(param);
+  auto multishift = ::testing::get<4>(param);
+  auto solution_accumulator_pipeline = ::testing::get<5>(param);
+  auto schwarz_param = ::testing::get<6>(param);
+  auto prec_precondition = ::testing::get<2>(schwarz_param);
+
+  if (prec < prec_sloppy) return true;              // outer precision >= sloppy precision
+  if (!(QUDA_PRECISION & prec_sloppy)) return true; // precision not enabled so skip it
+  if (!(QUDA_PRECISION & prec_precondition) && prec_precondition != QUDA_INVALID_PRECISION)
+    return true; // precision not enabled so skip it
+  if (prec_sloppy < prec_precondition) return true; // sloppy precision >= preconditioner precision
+
+  // Skip if the inverter does not support batched update and batched update is greater than one
+  if (!support_solution_accumulator_pipeline(inverter_type) && solution_accumulator_pipeline > 1) return true;
+  // There's no MLocal or MdagMLocal support yet, this is left in for reference
+  //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ)
+  //  if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true;
+
+  // split-grid doesn't support multi-shift solves at present
+  if (use_split_grid && multishift > 1) return true;
+
+  return false;
+}
+
+std::vector<std::array<double, 2>> solve(test_t param);
+
+TEST_P(StaggeredInvertTest, verify)
+{
+  if (skip_test(GetParam())) GTEST_SKIP();
+
+  inv_param.tol = 0.0;
+  inv_param.tol_hq = 0.0;
+  auto res_t = ::testing::get<7>(GetParam());
+  if (res_t & QUDA_L2_RELATIVE_RESIDUAL) inv_param.tol = tol;
+  if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq;
+
+  inv_param.reliable_delta = reliable_delta;
+
+  auto tol = inv_param.tol;
+  // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this
+  if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50;
+  // Slight loss of precision possible when reconstructing full solution
+  if (is_full_solution(::testing::get<1>(GetParam())) && is_preconditioned_solve(::testing::get<2>(GetParam())))
+    tol *= 10;
+
+  // Slight loss of precision seems to be possible in single precision
+  // with the asqtad operator, though it looks like it's because of the
+  // fat/long links going through a few precision conversions here and there
+  if (dslash_type == QUDA_ASQTAD_DSLASH && prec <= QUDA_SINGLE_PRECISION)
+    tol *= 1.1;
+
+  for (auto rsd : solve(GetParam())) {
+    if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); }
+    if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); }
+  }
+}
+
+std::string gettestname(::testing::TestParamInfo<test_t> param)
+{
+  std::string name;
+  name += get_solver_str(::testing::get<0>(param.param)) + std::string("_");
+  name += get_solution_str(::testing::get<1>(param.param)) + std::string("_");
+  name += get_solve_str(::testing::get<2>(param.param)) + std::string("_");
+  name += get_prec_str(::testing::get<3>(param.param));
+  if (::testing::get<4>(param.param) > 1)
+    name += std::string("_shift") + std::to_string(::testing::get<4>(param.param));
+  if (::testing::get<5>(param.param) > 1)
+    name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<5>(param.param));
+  auto &schwarz_param = ::testing::get<6>(param.param);
+  if (::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) {
+    name += std::string("_") + get_schwarz_str(::testing::get<0>(schwarz_param));
+    name += std::string("_") + get_solver_str(::testing::get<1>(schwarz_param));
+    name += std::string("_") + get_prec_str(::testing::get<2>(schwarz_param));
+  }
+  auto res_t = ::testing::get<7>(param.param);
+  if (res_t & QUDA_L2_RELATIVE_RESIDUAL) name += std::string("_l2");
+  if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) name += std::string("_heavy_quark");
+  return name;
+}
+
+using ::testing::Combine;
+using ::testing::Values;
+
+auto staggered_pc_solvers
+  = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER, QUDA_GCR_INVERTER,
+           QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
+
+auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER);
+
+// The spectrum of the staggered operator means MR has a miserable time converging,
+// it's not MR's fault. Other solvers have troubles too, which I need to think through.
+//auto direct_solvers
+//  = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER,
+//           QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
+
+auto direct_solvers
+  = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_BICGSTABL_INVERTER);
+
+auto sloppy_precisions
+  = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION);
+
+auto solution_accumulator_pipelines = Values(1, 8);
+
+auto no_schwarz = Combine(Values(QUDA_INVALID_SCHWARZ), Values(QUDA_INVALID_INVERTER), Values(QUDA_INVALID_PRECISION));
+
+auto no_heavy_quark = Values(QUDA_L2_RELATIVE_RESIDUAL);
+
+// the staggered PC op doesn't support "normal" operators since it's already
+// Hermitian positive definite
+
+// preconditioned solves
+INSTANTIATE_TEST_SUITE_P(EvenOdd, StaggeredInvertTest,
+                         Combine(staggered_pc_solvers, Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION),
+                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1),
+                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         gettestname);
+
+// full system normal solve
+INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredInvertTest,
+                         Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE),
+                                 sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         gettestname);
+
+// full system direct solve
+INSTANTIATE_TEST_SUITE_P(Full, StaggeredInvertTest,
+                         Combine(direct_solvers, Values(QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_SOLVE), sloppy_precisions,
+                                 Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         gettestname);
+
+// preconditioned multi-shift solves
+INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest,
+                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
+                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(10),
+                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         gettestname);
+
+// Schwarz-preconditioned normal solves
+//INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest,
+//                         Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION),
+//                                 Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1),
+//                                 solution_accumulator_pipelines,
+//                                 Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER),
+//                                         Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)),
+//                                 no_heavy_quark),
+//                         gettestname);
+
+// Schwarz-preconditioned direct solves
+//INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, StaggeredInvertTest,
+//                         Combine(Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
+//                                 sloppy_precisions, Values(1), solution_accumulator_pipelines,
+//                                 Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_MR_INVERTER, QUDA_CA_GCR_INVERTER),
+//                                         Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)),
+//                                 no_heavy_quark),
+//                         gettestname);
+
+// Heavy-Quark preconditioned solves
+INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest,
+                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
+                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1),
+                                 solution_accumulator_pipelines, no_schwarz,
+                                 Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)),
+                         gettestname);

From 5aa628fe0695c169151ea2e6527f4c53715f6648 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 30 Nov 2023 22:02:22 -0800
Subject: [PATCH 15/53] More fully pipecleaned staggered ctest; split grid
 testing outstanding

---
 tests/CMakeLists.txt                  | 45 ++++++++++++++++++-
 tests/staggered_invert_test.cpp       | 28 +++++++++---
 tests/staggered_invert_test_gtest.hpp | 64 ++++++++++++++++-----------
 3 files changed, 104 insertions(+), 33 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 135bbd90be..3fea1f68b4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -953,7 +953,7 @@ elseif(single_prec)
   set(TEST_PRECS single)
 endif()
 
-# Inversions
+# Wilson-type Inversions
 foreach(prec IN LISTS TEST_PRECS)
 
   if(${prec} STREQUAL "double")
@@ -1128,6 +1128,49 @@ foreach(prec IN LISTS TEST_PRECS)
   endif()
 endforeach(prec)
 
+# Staggered-type Inversions
+foreach(prec IN LISTS TEST_PRECS)
+
+  # These require looser tolerances to keep iterations to solution in check
+  if(${prec} STREQUAL "double")
+    set(tol 1e-6)
+  elseif(${prec} STREQUAL "single")
+    set(tol 1e-5)
+  endif()
+
+  if(QUDA_DIRAC_STAGGERED)
+    # --compute-fat-long true is necessary to get well-behaved fields
+
+    add_test(NAME invert_test_staggered_${prec}
+      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+      --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
+      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+      --enable-testing true
+      --gtest_output=xml:invert_test_staggered_${prec}.xml)
+
+#      if(DEFINED ENV{QUDA_ENABLE_TUNING})
+#        if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
+#          add_test(NAME invert_test_splitgrid_wilson_${prec}
+#            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+#            --dslash-type wilson --ngcrkrylov 8
+#            --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+#            --nsrc ${QUDA_TEST_NUM_PROCS}
+#            --enable-testing true
+#            --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml)
+#
+#          set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+#        endif()
+#      endif()
+
+    add_test(NAME invert_test_asqtad_${prec}
+      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+      --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true
+      --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+      --enable-testing true
+      --gtest_output=xml:invert_test_asqtad_${prec}.xml)
+  endif()
+endforeach(prec)
+
 # Eigensolves
 foreach(prec IN LISTS TEST_PRECS)
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index dba2813916..3299f0d84f 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -449,13 +449,29 @@ int main(int argc, char **argv)
 
   initQuda(device_ordinal);
 
-  // need force a well-behaved operator + reasonable convergence
   if (enable_testing) {
-    compute_fatlong = true;
-    mass = 0.32; // yes, it's a magic number
-    tol = 1e-6;
-    tol_hq = 1e-6;
-    //niter = 500; // the staggered spectrum is rough
+    // We need to force a well-behaved operator + reasonable convergence, otherwise
+    // the staggered tests will fail. These checks are designed to be consistent
+    // with what's in [src]/tests/CMakeLists.txt, which have been "sanity checked"
+    if (!compute_fatlong) {
+      warningQuda("compute_fatlong = %d , expected value %d , overriding", compute_fatlong, true);
+      compute_fatlong = true;
+    }
+
+    double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 1e-5 : 1e-6;
+    if (tol != expected_tol) {
+      warningQuda("tol = %e , expected value %e , overriding", tol, expected_tol);
+      tol = expected_tol;
+    }
+    if (tol_hq != expected_tol) {
+      warningQuda("tol_hq = %e , expected value %e , overriding", tol_hq, expected_tol);
+      tol_hq = expected_tol; // same precision-dependent value the warning reports
+    }
+
+    if (niter != 1000) {
+      warningQuda("niter = %d , expected value %d , overriding", niter, 1000);
+      niter = 1000; // override the iteration cap itself, not compute_fatlong
+    }
   }
 
   init();
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 4eee2b37ee..79c4de768d 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -77,7 +77,6 @@ bool support_solution_accumulator_pipeline(QudaInverterType type)
 bool skip_test(test_t param)
 {
   auto inverter_type = ::testing::get<0>(param);
-  auto solution_type = ::testing::get<1>(param);
   auto prec_sloppy = ::testing::get<3>(param);
   auto multishift = ::testing::get<4>(param);
   auto solution_accumulator_pipeline = ::testing::get<5>(param);
@@ -114,25 +113,40 @@ TEST_P(StaggeredInvertTest, verify)
   if (res_t & QUDA_L2_RELATIVE_RESIDUAL) inv_param.tol = tol;
   if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq;
 
-  inv_param.reliable_delta = reliable_delta;
+  auto inverter_type = ::testing::get<0>(param);
+  auto solution_type = ::testing::get<1>(param);
+  auto solve_type = ::testing::get<2>(param);
 
-  auto tol = inv_param.tol;
   // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this
-  if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50;
-  // Slight loss of precision possible when reconstructing full solution
-  if (is_full_solution(::testing::get<1>(GetParam())) && is_preconditioned_solve(::testing::get<2>(GetParam())))
-    tol *= 10;
-
-  // Slight loss of precision seems to be possible in single precision
-  // with the asqtad operator, though it looks like it's because of the
-  // fat/long links going through a few precision conversions here and there
-  if (dslash_type == QUDA_ASQTAD_DSLASH && prec <= QUDA_SINGLE_PRECISION)
+  // The mass squared is a proxy for the condition number
+  if (is_normal_residual(inverter_type)) tol /= (0.25 * mass * mass);
+
+  // To solve the direct operator to a given tolerance, grind the preconditioned
+  // operator to 0.5 * mass * tol... to keep the target tolerance in inv_param
+  // in check, we shift the requirement to the verified tolerance instead.
+  if (is_full_solution(solution_type) && is_preconditioned_solve(solve_type)) {
+    if (solve_type == QUDA_DIRECT_PC_SOLVE)
+      tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps
+    else if (solve_type == QUDA_NORMOP_PC_SOLVE)
+      tol /= (0.25 * mass * mass); // same as above, but squared as a proxy for the condition number
+  }
+
+  // The power iterations method of determining the Chebyshev window
+  // breaks down due to the nature of the spectrum of the direct operator
+  auto ca_basis_tmp = inv_param.ca_basis;
+  if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER)
+    inv_param.ca_basis = QUDA_POWER_BASIS;
+
+  // Slight loss of precision seems to be possible with the asqtad operator
+  if (dslash_type == QUDA_ASQTAD_DSLASH)
     tol *= 1.1;
 
   for (auto rsd : solve(GetParam())) {
     if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); }
     if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); }
   }
+
+  inv_param.ca_basis = ca_basis_tmp;
 }
 
 std::string gettestname(::testing::TestParamInfo<test_t> param)
@@ -167,14 +181,9 @@ auto staggered_pc_solvers
 
 auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER);
 
-// The spectrum of the staggered operator means MR has a miserable time converging,
-// it's not MR's fault. Other solvers have troubles too, which I need to think through.
-//auto direct_solvers
-//  = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER,
-//           QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
-
 auto direct_solvers
-  = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_BICGSTABL_INVERTER);
+  = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER,
+           QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
 
 auto sloppy_precisions
   = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION);
@@ -214,6 +223,16 @@ INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest,
                                  solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 
+// Heavy-Quark preconditioned solves
+INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest,
+                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
+                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1),
+                                 solution_accumulator_pipelines, no_schwarz,
+                                 Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)),
+                         gettestname);
+
+// These are left in but commented out for future reference
+
 // Schwarz-preconditioned normal solves
 //INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest,
 //                         Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION),
@@ -233,10 +252,3 @@ INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest,
 //                                 no_heavy_quark),
 //                         gettestname);
 
-// Heavy-Quark preconditioned solves
-INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest,
-                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
-                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1),
-                                 solution_accumulator_pipelines, no_schwarz,
-                                 Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)),
-                         gettestname);

From 550a5a91968f726e2e3b6bc48a8729d9a331d3cc Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Fri, 1 Dec 2023 10:48:10 -0800
Subject: [PATCH 16/53] Enabled split grid

---
 tests/CMakeLists.txt                  | 26 +++++++++++++-------------
 tests/invert_test_gtest.hpp           |  2 +-
 tests/staggered_invert_test.cpp       |  2 +-
 tests/staggered_invert_test_gtest.hpp |  2 +-
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3fea1f68b4..b26f49e529 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1148,19 +1148,19 @@ foreach(prec IN LISTS TEST_PRECS)
       --enable-testing true
       --gtest_output=xml:invert_test_staggered_${prec}.xml)
 
-#      if(DEFINED ENV{QUDA_ENABLE_TUNING})
-#        if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
-#          add_test(NAME invert_test_splitgrid_wilson_${prec}
-#            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-#            --dslash-type wilson --ngcrkrylov 8
-#            --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-#            --nsrc ${QUDA_TEST_NUM_PROCS}
-#            --enable-testing true
-#            --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml)
-#
-#          set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
-#        endif()
-#      endif()
+      if(DEFINED ENV{QUDA_ENABLE_TUNING})
+        if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
+          add_test(NAME invert_test_splitgrid_staggered_${prec}
+            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+            --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
+            --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+            --nsrc ${QUDA_TEST_NUM_PROCS}
+            --enable-testing true
+            --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml)
+
+          set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+        endif()
+      endif()
 
     add_test(NAME invert_test_asqtad_${prec}
       COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index 74866dbcd7..27c9c873f1 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -117,7 +117,7 @@ bool skip_test(test_t param)
     return true;
 #endif
   }
-  // split-grid doesn't support split-grid at present
+  // split-grid doesn't support multishift at present
   if (use_split_grid && multishift > 1) return true;
 
   return false;
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 3299f0d84f..4ec84ba49d 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -247,7 +247,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
   // params related to split grid.
   for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
   int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
-  bool use_split_grid = num_sub_partition > 1;
+  use_split_grid = num_sub_partition > 1;
 
   // Setup the multigrid preconditioner
   void *mg_preconditioner = nullptr;
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 79c4de768d..243e0ffdc9 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -95,7 +95,7 @@ bool skip_test(test_t param)
   //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ)
   //  if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true;
 
-  // split-grid doesn't support split-grid at present
+  // split-grid doesn't support multigrid at present
   if (use_split_grid && multishift > 1) return true;
 
   return false;

From b2560b571779c45619ae8945e7644ba3f8af1b78 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Fri, 1 Dec 2023 11:11:28 -0800
Subject: [PATCH 17/53] Added info on how to run the old tests

---
 tests/staggered_invert_test.cpp | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 4ec84ba49d..fbd451adee 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -26,6 +26,9 @@ QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL];
 QudaEigParam eig_param;
 bool use_split_grid = false;
 
+// print instructions on how to run the old tests
+bool print_legacy_info = false;
+
 // if --enable-testing true is passed, we run the tests defined in here
 #include <staggered_invert_test_gtest.hpp>
 
@@ -111,6 +114,18 @@ void display_test_info()
              dimPartitioned(3));
 }
 
+void display_legacy_info()
+{
+  printfQuda("Instructions for running legacy tests:\n");
+  printfQuda("--test 0 -> --solve-type direct    --solution-type mat    --inv-type bicgstab\n");
+  printfQuda("--test 1 -> --solve-type direct-pc --solution-type mat    --inv-type cg --matpc even-even\n");
+  printfQuda("--test 2 -> --solve-type direct-pc --solution-type mat    --inv-type cg --matpc odd-odd\n");
+  printfQuda("--test 3 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even\n");
+  printfQuda("--test 4 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd\n");
+  printfQuda("--test 5 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even --multishift 8\n");
+  printfQuda("--test 6 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd   --multishift 8\n");
+}
+
 GaugeField cpuFatQDP = {};
 GaugeField cpuLongQDP = {};
 GaugeField cpuFatMILC = {};
@@ -416,6 +431,7 @@ int main(int argc, char **argv)
   add_multigrid_option_group(app);
   add_comms_option_group(app);
   add_testing_option_group(app);
+  app->add_option("--legacy-test-info", print_legacy_info, "Print info on how to reproduce the old '--test #' behavior with flags, then exit");
   try {
     app->parse(argc, argv);
   } catch (const CLI::ParseError &e) {
@@ -423,15 +439,20 @@ int main(int argc, char **argv)
   }
   setVerbosity(verbosity);
 
-  if (inv_deflate && inv_multigrid)
-    errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n");
-
   // Set values for precisions via the command line.
   setQudaPrecisions();
 
   // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
   initComms(argc, argv, gridsize_from_cmdline);
 
+  if (print_legacy_info) {
+    display_legacy_info();
+    errorQuda("Exiting...");
+  }
+
+  if (inv_deflate && inv_multigrid)
+    errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n");
+
   initRand();
 
   // Only these fermions are supported in this file

From e952379bcc1911a8910aae85a1f215762b41d1a7 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Fri, 1 Dec 2023 13:22:05 -0800
Subject: [PATCH 18/53] Added Laplace ctests, tweaked some tolerances,
 uncovered a BiCGStab issue

---
 tests/CMakeLists.txt                  | 22 +++++++++++++
 tests/staggered_dslash_ctest.cpp      | 11 +++++--
 tests/staggered_dslash_test.cpp       | 11 +++++--
 tests/staggered_eigensolve_test.cpp   | 11 +++++--
 tests/staggered_invert_test.cpp       | 13 +++++---
 tests/staggered_invert_test_gtest.hpp | 45 +++++++++++++++++++++++----
 tests/utils/host_utils.h              |  7 +++++
 7 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b26f49e529..3955de2cb1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -928,6 +928,19 @@ endif()
     if(polenv)
       set_tests_properties(dslash_${DIRAC_NAME}_build_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
     endif()
+
+    if(QUDA_LAPLACE)
+      set(DIRAC_NAME laplace)
+      add_test(NAME dslash_${DIRAC_NAME}_mat_policy${pol2}
+               COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                       --dslash-type ${DIRAC_NAME}
+                       --test Mat
+                       --dim 2 4 6 8
+                       --gtest_output=xml:dslash_${DIRAC_NAME}_mat_test_pol${pol2}.xml)
+      if(polenv)
+        set_tests_properties(dslash_${DIRAC_NAME}_mat_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
+      endif()
+    endif()
   endif()
 
   if(QUDA_COVDEV)
@@ -1168,6 +1181,15 @@ foreach(prec IN LISTS TEST_PRECS)
       --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
       --enable-testing true
       --gtest_output=xml:invert_test_asqtad_${prec}.xml)
+
+    if (QUDA_LAPLACE)
+      add_test(NAME invert_test_laplace_${prec}
+        COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+        --dslash-type laplace --ngcrkrylov 8 --compute-fat-long true
+        --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+        --enable-testing true
+        --gtest_output=xml:invert_test_laplace_${prec}.xml)
+    endif()
   endif()
 endforeach(prec)
 
diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index f17dfa278c..28a6a48141 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -122,8 +122,15 @@ int main(int argc, char **argv)
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
 
   // Only these fermions are supported in this file
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
-    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  if (is_laplace_enabled) {
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  } else {
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  }
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't
   // ask to build the fat/long links... it doesn't make sense.
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index 4ebed72ed8..2883d29b64 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -85,8 +85,15 @@ int main(int argc, char **argv)
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
 
   // Only these fermions are supported in this file
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
-    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  if (is_laplace_enabled) {
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  } else {
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  }
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash,
   // and don't ask to build the fat/long links... it doesn't make sense.
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index f0f2c3eec8..797dcb1311 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -79,8 +79,15 @@ int main(int argc, char **argv)
   setQudaPrecisions();
 
   // Only these fermions are supported in this file
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
-    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  if (is_laplace_enabled) {
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  } else {
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  }
 
   display_test_info();
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index fbd451adee..be0ef6fd6b 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -456,8 +456,15 @@ int main(int argc, char **argv)
   initRand();
 
   // Only these fermions are supported in this file
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
-    errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  if (is_laplace_enabled) {
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  } else {
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  }
 
   // Need to add support for LAPLACE MG?
   if (inv_multigrid) {
@@ -501,8 +508,6 @@ int main(int argc, char **argv)
   if (enable_testing) { // tests are defined in staggered_invert_test_gtest.hpp
     ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
     if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
-      errorQuda("Staggered ctest doesn't support the Laplace operator (yet)");
     result = RUN_ALL_TESTS();
   } else {
     solve(test_t {inv_type, solution_type, solve_type, prec_sloppy, multishift, solution_accumulator_pipeline,
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 243e0ffdc9..c5884c210a 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -16,6 +16,15 @@ class StaggeredInvertTest : public ::testing::TestWithParam<test_t>
   StaggeredInvertTest() : param(GetParam()) { }
 };
 
+bool is_hermitian_solver(QudaInverterType type)
+{
+  switch(type) {
+  case QUDA_CG_INVERTER:
+  case QUDA_CA_CG_INVERTER: return true;
+  default: return false;
+  }
+}
+
 bool is_normal_residual(QudaInverterType type)
 {
   switch (type) {
@@ -77,6 +86,8 @@ bool support_solution_accumulator_pipeline(QudaInverterType type)
 bool skip_test(test_t param)
 {
   auto inverter_type = ::testing::get<0>(param);
+  auto solution_type = ::testing::get<1>(param);
+  auto solve_type = ::testing::get<2>(param);
   auto prec_sloppy = ::testing::get<3>(param);
   auto multishift = ::testing::get<4>(param);
   auto solution_accumulator_pipeline = ::testing::get<5>(param);
@@ -95,6 +106,19 @@ bool skip_test(test_t param)
   //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ)
   //  if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true;
 
+  if (dslash_type == QUDA_LAPLACE_DSLASH) {
+    if (multishift > 1) return true; // Laplace doesn't support multishift
+    if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE) return true; // Laplace only supports direct solves
+  }
+
+  if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) {
+    // the staggered and asqtad operators aren't HPD
+    if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type)) return true;
+
+    // MR struggles with the staggered and asqtad spectrum, it's not MR's fault
+    if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true;
+  }
+
   // split-grid doesn't support multigrid at present
   if (use_split_grid && multishift > 1) return true;
 
@@ -107,6 +131,8 @@ TEST_P(StaggeredInvertTest, verify)
 {
   if (skip_test(GetParam())) GTEST_SKIP();
 
+  auto tol_backup = tol;
+
   inv_param.tol = 0.0;
   inv_param.tol_hq = 0.0;
   auto res_t = ::testing::get<7>(GetParam());
@@ -124,11 +150,13 @@ TEST_P(StaggeredInvertTest, verify)
   // To solve the direct operator to a given tolerance, grind the preconditioned
   // operator to 0.5 * mass * tol... to keep the target tolerance in inv_param
   // in check, we shift the requirement to the verified tolerance instead.
-  if (is_full_solution(solution_type) && is_preconditioned_solve(solve_type)) {
+  if (solution_type == QUDA_MAT_SOLUTION) {
     if (solve_type == QUDA_DIRECT_PC_SOLVE)
       tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps
-    else if (solve_type == QUDA_NORMOP_PC_SOLVE)
-      tol /= (0.25 * mass * mass); // same as above, but squared as a proxy for the condition number
+    if (solve_type == QUDA_NORMOP_SOLVE)
+      tol /= (0.5 * mass); // a proxy for the condition number
+  } else if (solution_type == QUDA_MATDAG_MAT_SOLUTION) {
+    tol *= 1.05; // seems to need a bit of a bump
   }
 
   // The power iterations method of determining the Chebyshev window
@@ -137,16 +165,21 @@ TEST_P(StaggeredInvertTest, verify)
   if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER)
     inv_param.ca_basis = QUDA_POWER_BASIS;
 
-  // Slight loss of precision seems to be possible with the asqtad operator
-  if (dslash_type == QUDA_ASQTAD_DSLASH)
+  // FIXME: there's an issue in mixed precision BiCGStab I need to squash.
+  if (inverter_type == QUDA_BICGSTAB_INVERTER)
     tol *= 1.1;
 
+  // CGNE needs a bit of a bump
+  if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER)
+    tol *= 1.05;
+
   for (auto rsd : solve(GetParam())) {
     if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); }
     if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); }
   }
 
   inv_param.ca_basis = ca_basis_tmp;
+  tol = tol_backup;
 }
 
 std::string gettestname(::testing::TestParamInfo<test_t> param)
@@ -182,7 +215,7 @@ auto staggered_pc_solvers
 auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER);
 
 auto direct_solvers
-  = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER,
+  = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER,
            QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
 
 auto sloppy_precisions
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 6dfdcfd856..f5276e26f1 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -40,6 +40,13 @@ extern QudaPrecision &cuda_prec_eigensolver;
 extern QudaPrecision &cuda_prec_refinement_sloppy;
 extern QudaPrecision &cuda_prec_ritz;
 
+// Determine if the Laplace operator has been defined
+#ifdef QUDA_LAPLACE
+constexpr bool is_laplace_enabled = true;
+#else
+constexpr bool is_laplace_enabled = false;
+#endif
+
 // Set some basic parameters via command line or use defaults
 // Implemented in set_params.cpp
 void setQudaStaggeredDefaultInvTestParams();

From afb41b609b3746d9f1dd1df11f65f2b27290e50a Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Fri, 1 Dec 2023 13:27:15 -0800
Subject: [PATCH 19/53] Quality of life BiCGStab readability changes

---
 lib/inv_bicgstab_quda.cpp | 117 +++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 64 deletions(-)

diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index 4fdf08020a..ecb0352095 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -76,7 +76,7 @@ namespace quda {
     ColorSpinorField *x_sloppy, *r_sloppy, *r_0;
 
     double b2 = blas::norm2(b); // norm sq of source
-    double r2;               // norm sq of residual
+    double r2;                  // norm sq of residual
 
     if (param.deflate) {
       // Construct the eigensolver and deflation space if requested.
@@ -134,7 +134,7 @@ namespace quda {
         x = b;
         param.true_res = 0.0;
         param.true_res_hq = 0.0;
-	profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
+        profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
         return;
       } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
         b2 = r2;
@@ -147,12 +147,9 @@ namespace quda {
     if (param.precision_sloppy == x.Precision()) {
       r_sloppy = &r;
 
-      if(param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO)
-      {
+      if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO){
         r_0 = &b;
-      }
-      else
-      {
+      } else {
         ColorSpinorParam csParam(r);
         csParam.create = QUDA_ZERO_FIELD_CREATE;
         r_0 = new ColorSpinorField(csParam); // remember to delete this pointer.
@@ -168,13 +165,10 @@ namespace quda {
       *r_0 = r;
     }
 
-    if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator)
-    {
+    if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator) {
       x_sloppy = &x;
       blas::zero(*x_sloppy);
-    }
-    else
-    {
+    } else {
       ColorSpinorParam csParam(x);
       csParam.create = QUDA_ZERO_FIELD_CREATE;
       csParam.setPrecision(param.precision_sloppy);
@@ -222,20 +216,19 @@ namespace quda {
 
     if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
       printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n",
-		 blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p),
-		 blas::norm2(tmp), blas::norm2(r0), blas::norm2(t));
+                 blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p),
+                 blas::norm2(tmp), blas::norm2(r0), blas::norm2(t));
 
-    while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) &&
-	    k < param.maxiter) {
+    while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
 
       matSloppy(v, p);
 
       Complex r0v;
       if (param.pipeline) {
-	r0v = blas::cDotProduct(r0, v);
-	if (k>0) rho = blas::cDotProduct(r0, r);
+        r0v = blas::cDotProduct(r0, v);
+        if (k>0) rho = blas::cDotProduct(r0, r);
       } else {
-	r0v = blas::cDotProduct(r0, v);
+        r0v = blas::cDotProduct(r0, v);
       }
       if (abs(rho) == 0.0) alpha = 0.0;
       else alpha = rho / r0v;
@@ -247,38 +240,37 @@ namespace quda {
 
       int updateR = 0;
       if (param.pipeline) {
-	// omega = (t, r) / (t, t)
-	omega_t2 = blas::cDotProductNormA(t, rSloppy);
-	Complex tr = Complex(omega_t2.x, omega_t2.y);
-	double t2 = omega_t2.z;
-	omega = tr / t2;
-	double s2 = blas::norm2(rSloppy);
-	Complex r0t = blas::cDotProduct(r0, t);
-	beta = -r0t / r0v;
-	r2 = s2 - real(omega * conj(tr)) ;
-
-	// now we can work out if we need to do a reliable update
+        // omega = (t, r) / (t, t)
+        omega_t2 = blas::cDotProductNormA(t, rSloppy);
+        Complex tr = Complex(omega_t2.x, omega_t2.y);
+        double t2 = omega_t2.z;
+        omega = tr / t2;
+        double s2 = blas::norm2(rSloppy);
+        Complex r0t = blas::cDotProduct(r0, t);
+        beta = -r0t / r0v;
+        r2 = s2 - real(omega * conj(tr)) ;
+        // now we can work out if we need to do a reliable update
         updateR = reliable(rNorm, maxrx, maxrr, r2, delta);
       } else {
-	// omega = (t, r) / (t, t)
-	omega_t2 = blas::cDotProductNormA(t, rSloppy);
-	omega = Complex(omega_t2.x / omega_t2.z, omega_t2.y / omega_t2.z);
+        // omega = (t, r) / (t, t)
+        omega_t2 = blas::cDotProductNormA(t, rSloppy);
+        omega = Complex(omega_t2.x / omega_t2.z, omega_t2.y / omega_t2.z);
       }
 
       if (param.pipeline && !updateR) {
-	//x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p
-	blas::caxpbypzYmbw(alpha, p, omega, rSloppy, xSloppy, t);
-	blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p);
-	//tripleBiCGstabUpdate(alpha, p, omega, rSloppy, xSloppy, t, -beta*omega, v, beta, p
+        //x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p
+        blas::caxpbypzYmbw(alpha, p, omega, rSloppy, xSloppy, t);
+        blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p);
+        //tripleBiCGstabUpdate(alpha, p, omega, rSloppy, xSloppy, t, -beta*omega, v, beta, p
       } else {
-	//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
-	rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, rSloppy, xSloppy, t, r0);
-	rho0 = rho;
-	rho = Complex(rho_r2.x, rho_r2.y);
-	r2 = rho_r2.z;
+        //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
+        rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, rSloppy, xSloppy, t, r0);
+        rho0 = rho;
+        rho = Complex(rho_r2.x, rho_r2.y);
+        r2 = rho_r2.z;
       }
 
-      if (use_heavy_quark_res && k%heavy_quark_check==0) {
+      if (use_heavy_quark_res && k % heavy_quark_check==0) {
         if (&x != &xSloppy) {
            blas::copy(tmp,y);
            heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy).z);
@@ -291,9 +283,9 @@ namespace quda {
       if (!param.pipeline) updateR = reliable(rNorm, maxrx, maxrr, r2, delta);
 
       if (updateR) {
-	if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy);
+        if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy);
 
-	blas::xpy(x, y); // swap these around?
+        blas::xpy(x, y); // swap these around?
 
         mat(r, y);
         r2 = blas::xmyNorm(b, r);
@@ -307,31 +299,30 @@ namespace quda {
           r2 = blas::xmyNorm(b, r);
         }
 
-	if (x.Precision() != rSloppy.Precision()) blas::copy(rSloppy, r);
-	blas::zero(xSloppy);
+        if (x.Precision() != rSloppy.Precision()) blas::copy(rSloppy, r);
+        blas::zero(xSloppy);
 
-	rNorm = sqrt(r2);
-	maxrr = rNorm;
-	maxrx = rNorm;
-	//r0Norm = rNorm;
-	rUpdate++;
+        rNorm = sqrt(r2);
+        maxrr = rNorm;
+        maxrx = rNorm;
+        //r0Norm = rNorm;
+        rUpdate++;
       }
 
       k++;
 
       PrintStats("BiCGstab", k, r2, b2, heavy_quark_res);
       if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
-	printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n",
-		   blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p),
-		   blas::norm2(tmp), blas::norm2(r0), blas::norm2(t));
+        printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n",
+          blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p),
+          blas::norm2(tmp), blas::norm2(r0), blas::norm2(t));
 
       // update p
-      if (!param.pipeline || updateR) {// need to update if not pipeline or did a reliable update
-	if (abs(rho*alpha) == 0.0) beta = 0.0;
-	else beta = (rho/rho0) * (alpha/omega);
-	blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p);
+      if (!param.pipeline || updateR) { // need to update if not pipeline or did a reliable update
+        if (abs(rho*alpha) == 0.0) beta = 0.0;
+        else beta = (rho/rho0) * (alpha/omega);
+        blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p);
       }
-
     }
 
     if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy);
@@ -342,7 +333,7 @@ namespace quda {
 
     param.iter += k;
 
-    if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
+    if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
 
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("BiCGstab: Reliable updates = %d\n", rUpdate);
 
@@ -361,9 +352,7 @@ namespace quda {
     if (param.precision_sloppy != x.Precision()) {
       delete r_0;
       delete r_sloppy;
-    }
-    else if(param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES)
-    {
+    } else if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) {
       delete r_0;
     }
 

From 15eeb821ddc4c1d03d559a5c3d5bee9cea2c7a05 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Mon, 4 Dec 2023 14:47:08 -0800
Subject: [PATCH 20/53] Strong BiCGStab cleanup, still need to reconcile a host
 verification headache

---
 include/invert_quda.h                 |  11 +-
 lib/inv_bicgstab_quda.cpp             | 181 ++++++++++++++------------
 tests/staggered_invert_test_gtest.hpp |   9 +-
 3 files changed, 109 insertions(+), 92 deletions(-)

diff --git a/include/invert_quda.h b/include/invert_quda.h
index 11ac64708e..7cf26a6f4f 100644
--- a/include/invert_quda.h
+++ b/include/invert_quda.h
@@ -1048,8 +1048,15 @@ namespace quda {
 
   private:
     const DiracMdagM matMdagM; // used by the eigensolver
-    // pointers to fields to avoid multiple creation overhead
-    ColorSpinorField *yp, *rp, *pp, *vp, *tmpp, *tp;
+
+    ColorSpinorField y; // Full precision solution accumulator
+    ColorSpinorField r; // Full precision residual vector
+    ColorSpinorField p; // Sloppy precision search direction
+    ColorSpinorField v; // Sloppy precision A * p
+    ColorSpinorField t; // Sloppy precision vector used for minres step
+    ColorSpinorField r0; // Bi-orthogonalization vector
+    ColorSpinorField r_sloppy; // Sloppy precision residual vector
+    ColorSpinorField x_sloppy; // Sloppy solution accumulator vector
     bool init = false;
 
   public:
diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index ecb0352095..10ec609ec3 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -21,15 +21,6 @@ namespace quda {
 
   BiCGstab::~BiCGstab() {
     profile.TPSTART(QUDA_PROFILE_FREE);
-
-    if(init) {
-      delete yp;
-      delete rp;
-      delete pp;
-      delete vp;
-      delete tmpp;
-      delete tp;
-    }
     destroyDeflationSpace();
     profile.TPSTOP(QUDA_PROFILE_FREE);
   }
@@ -55,26 +46,16 @@ namespace quda {
     if (!init) {
       ColorSpinorParam csParam(x);
       csParam.create = QUDA_ZERO_FIELD_CREATE;
-      yp = new ColorSpinorField(csParam);
-      rp = new ColorSpinorField(csParam);
+      y = ColorSpinorField(csParam);
+      r = ColorSpinorField(csParam);
       csParam.setPrecision(param.precision_sloppy);
-      pp = new ColorSpinorField(csParam);
-      vp = new ColorSpinorField(csParam);
-      tmpp = new ColorSpinorField(csParam);
-      tp = new ColorSpinorField(csParam);
+      p = ColorSpinorField(csParam);
+      v = ColorSpinorField(csParam);
+      t = ColorSpinorField(csParam);
 
       init = true;
     }
 
-    ColorSpinorField &y = *yp;
-    ColorSpinorField &r = *rp;
-    ColorSpinorField &p = *pp;
-    ColorSpinorField &v = *vp;
-    ColorSpinorField &tmp = *tmpp;
-    ColorSpinorField &t = *tp;
-
-    ColorSpinorField *x_sloppy, *r_sloppy, *r_0;
-
     double b2 = blas::norm2(b); // norm sq of source
     double r2;                  // norm sq of residual
 
@@ -145,41 +126,36 @@ namespace quda {
 
     // set field aliasing according to whether we are doing mixed precision or not
     if (param.precision_sloppy == x.Precision()) {
-      r_sloppy = &r;
+      r_sloppy = r.create_alias();
 
-      if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO){
-        r_0 = &b;
+      if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) {
+        r0 = b.create_alias();
       } else {
         ColorSpinorParam csParam(r);
-        csParam.create = QUDA_ZERO_FIELD_CREATE;
-        r_0 = new ColorSpinorField(csParam); // remember to delete this pointer.
-        *r_0 = r;
+        csParam.create = QUDA_NULL_FIELD_CREATE;
+        r0 = ColorSpinorField(csParam);
+        blas::copy(r0, r);
       }
     } else {
       ColorSpinorParam csParam(x);
       csParam.setPrecision(param.precision_sloppy);
       csParam.create = QUDA_NULL_FIELD_CREATE;
-      r_sloppy = new ColorSpinorField(csParam);
-      *r_sloppy = r;
-      r_0 = new ColorSpinorField(csParam);
-      *r_0 = r;
+      r_sloppy = ColorSpinorField(csParam);
+      blas::copy(r_sloppy, r);
+      r0 = ColorSpinorField(csParam);
+      blas::copy(r0, r);
     }
 
     if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator) {
-      x_sloppy = &x;
-      blas::zero(*x_sloppy);
+      x_sloppy = x.create_alias();
+      blas::zero(x_sloppy);
     } else {
       ColorSpinorParam csParam(x);
       csParam.create = QUDA_ZERO_FIELD_CREATE;
       csParam.setPrecision(param.precision_sloppy);
-      x_sloppy = new ColorSpinorField(csParam);
+      x_sloppy = ColorSpinorField(csParam);
     }
 
-    // Syntatic sugar
-    ColorSpinorField &rSloppy = *r_sloppy;
-    ColorSpinorField &xSloppy = *x_sloppy;
-    ColorSpinorField &r0 = *r_0;
-
     double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver
 
     const bool use_heavy_quark_res =
@@ -212,21 +188,27 @@ namespace quda {
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
     rho = r2; // cDotProductCuda(r0, r_sloppy); // BiCRstab
-    blas::copy(p, rSloppy);
+    blas::copy(p, r_sloppy);
+
+    bool converged = convergence(r2, heavy_quark_res, stop, param.tol_hq);
 
     if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
-      printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n",
-                 blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p),
-                 blas::norm2(tmp), blas::norm2(r0), blas::norm2(t));
+      printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n",
+                 blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p),
+                 blas::norm2(r0), blas::norm2(t));
 
-    while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
+    // track if we just performed an exact recalculation of y, r, r2
+    bool just_updated = false;
+
+    while ( !converged && k < param.maxiter) {
+      just_updated = false;
 
       matSloppy(v, p);
 
       Complex r0v;
       if (param.pipeline) {
         r0v = blas::cDotProduct(r0, v);
-        if (k>0) rho = blas::cDotProduct(r0, r);
+        if (k > 0) rho = blas::cDotProduct(r0, r);
       } else {
         r0v = blas::cDotProduct(r0, v);
       }
@@ -234,18 +216,18 @@ namespace quda {
       else alpha = rho / r0v;
 
       // r -= alpha*v
-      blas::caxpy(-alpha, v, rSloppy);
+      blas::caxpy(-alpha, v, r_sloppy);
 
-      matSloppy(t, rSloppy);
+      matSloppy(t, r_sloppy);
 
       int updateR = 0;
       if (param.pipeline) {
         // omega = (t, r) / (t, t)
-        omega_t2 = blas::cDotProductNormA(t, rSloppy);
+        omega_t2 = blas::cDotProductNormA(t, r_sloppy);
         Complex tr = Complex(omega_t2.x, omega_t2.y);
         double t2 = omega_t2.z;
         omega = tr / t2;
-        double s2 = blas::norm2(rSloppy);
+        double s2 = blas::norm2(r_sloppy);
         Complex r0t = blas::cDotProduct(r0, t);
         beta = -r0t / r0v;
         r2 = s2 - real(omega * conj(tr)) ;
@@ -253,29 +235,28 @@ namespace quda {
         updateR = reliable(rNorm, maxrx, maxrr, r2, delta);
       } else {
         // omega = (t, r) / (t, t)
-        omega_t2 = blas::cDotProductNormA(t, rSloppy);
+        omega_t2 = blas::cDotProductNormA(t, r_sloppy);
         omega = Complex(omega_t2.x / omega_t2.z, omega_t2.y / omega_t2.z);
       }
 
       if (param.pipeline && !updateR) {
         //x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p
-        blas::caxpbypzYmbw(alpha, p, omega, rSloppy, xSloppy, t);
-        blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p);
-        //tripleBiCGstabUpdate(alpha, p, omega, rSloppy, xSloppy, t, -beta*omega, v, beta, p
+        blas::caxpbypzYmbw(alpha, p, omega, r_sloppy, x_sloppy, t);
+        blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p);
+        //tripleBiCGstabUpdate(alpha, p, omega, r_sloppy, x_sloppy, t, -beta*omega, v, beta, p
       } else {
         //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
-        rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, rSloppy, xSloppy, t, r0);
+        rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, r_sloppy, x_sloppy, t, r0);
         rho0 = rho;
         rho = Complex(rho_r2.x, rho_r2.y);
         r2 = rho_r2.z;
       }
 
       if (use_heavy_quark_res && k % heavy_quark_check==0) {
-        if (&x != &xSloppy) {
-           blas::copy(tmp,y);
-           heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy).z);
+        if (&x != &x_sloppy) {
+           heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x_sloppy, r_sloppy).z);
         } else {
-           blas::copy(r, rSloppy);
+           blas::copy(r, r_sloppy);
            heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z);
         }
       }
@@ -283,9 +264,9 @@ namespace quda {
       if (!param.pipeline) updateR = reliable(rNorm, maxrx, maxrr, r2, delta);
 
       if (updateR) {
-        if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy);
+        if (x.Precision() != x_sloppy.Precision()) blas::copy(x, x_sloppy);
 
-        blas::xpy(x, y); // swap these around?
+        blas::xpy(x, y);
 
         mat(r, y);
         r2 = blas::xmyNorm(b, r);
@@ -299,34 +280,74 @@ namespace quda {
           r2 = blas::xmyNorm(b, r);
         }
 
-        if (x.Precision() != rSloppy.Precision()) blas::copy(rSloppy, r);
-        blas::zero(xSloppy);
+        if (x.Precision() != r_sloppy.Precision()) blas::copy(r_sloppy, r);
+        blas::zero(x_sloppy);
 
         rNorm = sqrt(r2);
         maxrr = rNorm;
         maxrx = rNorm;
         //r0Norm = rNorm;
         rUpdate++;
+
+        just_updated = true;
       }
 
       k++;
 
       PrintStats("BiCGstab", k, r2, b2, heavy_quark_res);
       if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
-        printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n",
-          blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p),
-          blas::norm2(tmp), blas::norm2(r0), blas::norm2(t));
+        printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n",
+          blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p),
+          blas::norm2(r0), blas::norm2(t));
+
+      converged = convergence(r2, heavy_quark_res, stop, param.tol_hq);
+
+      if (converged) {
+        // make sure we've truly converged
+        if (!just_updated) {
+          if (x.Precision() != x_sloppy.Precision()) blas::copy(x, x_sloppy);
+          blas::xpy(x, y);
+          mat(r, y);
+          r2 = blas::xmyNorm(b, r);
+
+          if (param.deflate && sqrt(r2) < param.tol_restart) {
+            // Deflate and accumulate to solution vector
+            eig_solve->deflate(y, r, evecs, evals, true);
+            // Compute r_defl = RHS - A * LHS
+            mat(r, y);
+            r2 = blas::xmyNorm(b, r);
+          }
+
+          if (x.Precision() != r_sloppy.Precision()) blas::copy(r_sloppy, r);
+          blas::zero(x_sloppy);
+
+          rNorm = sqrt(r2);
+          maxrr = rNorm;
+          maxrx = rNorm;
+          //r0Norm = rNorm;
+          rUpdate++;
+
+          just_updated = true;
+        }
+
+        // explicitly compute the HQ residual if need be
+        heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(y, r).z) : 0.0;
+
+        // Update convergence check
+        converged = convergence(r2, heavy_quark_res, stop, param.tol_hq);
+      }
 
       // update p
-      if (!param.pipeline || updateR) { // need to update if not pipeline or did a reliable update
+      if ((!param.pipeline || updateR) && !converged) { // need to update if not pipeline or did a reliable update
         if (abs(rho*alpha) == 0.0) beta = 0.0;
         else beta = (rho/rho0) * (alpha/omega);
-        blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p);
+        blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p);
       }
     }
 
-    if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy);
-    blas::xpy(y, x);
+    // We have a guarantee that we just converged via the true residual
+    // y has already been updated
+    blas::copy(x, y);
 
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
@@ -338,9 +359,8 @@ namespace quda {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("BiCGstab: Reliable updates = %d\n", rUpdate);
 
     if (!param.is_preconditioner) { // do not do the below if we this is an inner solver
-      // Calculate the true residual
-      mat(r, x);
-      param.true_res = sqrt(blas::xmyNorm(b, r) / b2);
+      // r2 was freshly computed
+      param.true_res = sqrt(r2 / b2);
       param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,r).z) : 0.0;
 
       PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq);
@@ -348,17 +368,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-    profile.TPSTART(QUDA_PROFILE_FREE);
-    if (param.precision_sloppy != x.Precision()) {
-      delete r_0;
-      delete r_sloppy;
-    } else if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) {
-      delete r_0;
-    }
-
-    if (&x != &xSloppy) delete x_sloppy;
-
-    profile.TPSTOP(QUDA_PROFILE_FREE);
   }
 
 } // namespace quda
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index c5884c210a..675086e699 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -166,11 +166,12 @@ TEST_P(StaggeredInvertTest, verify)
     inv_param.ca_basis = QUDA_POWER_BASIS;
 
   // FIXME: there's an issue in mixed precision BiCGStab I need to squash.
-  if (inverter_type == QUDA_BICGSTAB_INVERTER)
-    tol *= 1.1;
+  //if (inverter_type == QUDA_BICGSTAB_INVERTER)
+  //  tol *= 1.1;
 
-  // CGNE needs a bit of a bump
-  if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER)
+  // CGNE and ASQTAD need a bit of a bump
+  if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER
+      || dslash_type == QUDA_ASQTAD_DSLASH)
     tol *= 1.05;
 
   for (auto rsd : solve(GetParam())) {

From fc65b73eeb26ceef3a6ef4f4043d8894d987b98c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Mon, 4 Dec 2023 17:27:55 -0800
Subject: [PATCH 21/53] Various misc cleanup

---
 tests/host_reference/dslash_reference.cpp |  2 +-
 tests/staggered_dslash_test_utils.h       | 33 ++++++++---------------
 tests/staggered_invert_test.cpp           |  7 ++---
 3 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index c2db9993f8..19c9616288 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -752,7 +752,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
   if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
     stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
 
-    // exact reason for this tbd, this isn't needed in the dslash test...
+    // correct for the massRescale function inside invertQuda
     if (dslash_type == QUDA_LAPLACE_DSLASH)
       ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision());
   } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) {
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 246dcdfea4..39bdc09c7b 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -66,8 +66,8 @@ struct StaggeredDslashTestWrapper {
   static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
   static inline void *milc_fatlink = nullptr;
   static inline void *milc_longlink = nullptr;
-  static inline GaugeField *cpuFat = nullptr;
-  static inline GaugeField *cpuLong = nullptr;
+  static inline GaugeField cpuFat;
+  static inline GaugeField cpuLong;
 
   QudaParity parity = QUDA_EVEN_PARITY;
 
@@ -77,26 +77,23 @@ struct StaggeredDslashTestWrapper {
   static inline bool test_split_grid = false;
   int num_src = 1;
 
-  // Whether or not we need the ghost zones
-  bool need_ghost_zone = false;
-
   void staggeredDslashRef()
   {
     // compare to dslash reference implementation
     printfQuda("Calculating reference implementation...");
     switch (dtest_type) {
     case dslash_test_type::Dslash:
-      stag_dslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type);
+      stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type);
       break;
     case dslash_test_type::MatPC:
-      stag_matpc(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type);
+      stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type);
       break;
     case dslash_test_type::Mat:
-      stag_mat(spinorRef, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type);
+      stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
       break;
     case dslash_test_type::MatDagMat:
-      stag_mat(tmpCpu, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type);
-      stag_mat(spinorRef, *cpuFat, *cpuLong, tmpCpu, mass, 1 - dagger, dslash_type);
+      stag_mat(tmpCpu, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
+      stag_mat(spinorRef, cpuFat, cpuLong, tmpCpu, mass, 1 - dagger, dslash_type);
       break;
     default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type));
     }
@@ -239,13 +236,13 @@ struct StaggeredDslashTestWrapper {
     GaugeFieldParam cpuFatParam(gauge_param, qdp_fatlink);
     cpuFatParam.order = QUDA_QDP_GAUGE_ORDER;
     cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-    cpuFat = GaugeField::Create(cpuFatParam);
+    cpuFat = GaugeField(cpuFatParam);
 
     gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
     GaugeFieldParam cpuLongParam(gauge_param, qdp_longlink);
     cpuLongParam.order = QUDA_QDP_GAUGE_ORDER;
     cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-    cpuLong = GaugeField::Create(cpuLongParam);
+    cpuLong = GaugeField(cpuLongParam);
 
     // Override link reconstruct as appropriate for staggered or asqtad
     if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) {
@@ -283,17 +280,9 @@ struct StaggeredDslashTestWrapper {
       delete dirac;
       dirac = nullptr;
     }
-
     freeGaugeQuda();
-
-    if (cpuFat) {
-      delete cpuFat;
-      cpuFat = nullptr;
-    }
-    if (cpuLong) {
-      delete cpuLong;
-      cpuLong = nullptr;
-    }
+    cpuFat = {};
+    cpuLong = {};
     commDimPartitionedReset();
   }
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index be0ef6fd6b..d8775e909f 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -172,7 +172,6 @@ void init()
   }
 
   setDims(gauge_param.X);
-  // Hack: use the domain wall dimensions so we may use the 5th dim for multi indexing
   dw_setDims(gauge_param.X, 1);
 
   // Staggered Gauge construct START
@@ -185,9 +184,9 @@ void init()
   gauge_param.location = QUDA_CPU_FIELD_LOCATION;
 
   GaugeFieldParam cpuParam(gauge_param);
-  cpuParam.create = QUDA_NULL_FIELD_CREATE;
-  cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuParam.order = QUDA_QDP_GAUGE_ORDER;
+  cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
+  cpuParam.create = QUDA_NULL_FIELD_CREATE;
   GaugeField cpuIn = GaugeField(cpuParam);
   cpuFatQDP = GaugeField(cpuParam);
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
@@ -222,6 +221,8 @@ void init()
     printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
   }
 
+  freeGaugeQuda();
+
   loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param);
 
   // now copy back to QDP aliases, since these are used for the reference dslash

From 7c5c2c5600eecd547d89b9800e598eafc4184a96 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 5 Dec 2023 16:01:45 -0800
Subject: [PATCH 22/53] Fixed a verify issue for full parity solves

---
 tests/host_reference/dslash_reference.cpp |  2 +-
 tests/staggered_invert_test_gtest.hpp     | 13 +++----------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 19c9616288..d92a4cf97d 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -771,7 +771,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
   }
 
   int len = 0;
-  if (solution_type == QUDA_MAT_SOLUTION || solution_type == QUDA_MATDAG_MAT_SOLUTION) {
+  if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
     len = V;
   } else {
     len = Vh;
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 675086e699..18c290b82c 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -155,8 +155,6 @@ TEST_P(StaggeredInvertTest, verify)
       tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps
     if (solve_type == QUDA_NORMOP_SOLVE)
       tol /= (0.5 * mass); // a proxy for the condition number
-  } else if (solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-    tol *= 1.05; // seems to need a bit of a bump
   }
 
   // The power iterations method of determining the Chebyshev window
@@ -165,14 +163,9 @@ TEST_P(StaggeredInvertTest, verify)
   if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER)
     inv_param.ca_basis = QUDA_POWER_BASIS;
 
-  // FIXME: there's an issue in mixed precision BiCGStab I need to squash.
-  //if (inverter_type == QUDA_BICGSTAB_INVERTER)
-  //  tol *= 1.1;
-
-  // CGNE and ASQTAD need a bit of a bump
-  if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER
-      || dslash_type == QUDA_ASQTAD_DSLASH)
-    tol *= 1.05;
+  // Single precision needs a tiny bump
+  if (prec == QUDA_SINGLE_PRECISION)
+    tol *= 1.01;
 
   for (auto rsd : solve(GetParam())) {
     if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); }

From b66fc76baed0846483ff17d95851a5cb5ff8a5fe Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 6 Dec 2023 12:06:29 -0800
Subject: [PATCH 23/53] Various staggered_invert_test cleanup, made it look
 more like invert_test.

---
 tests/staggered_invert_test.cpp       | 180 ++++++++++++++------------
 tests/staggered_invert_test_gtest.hpp |   2 +-
 2 files changed, 100 insertions(+), 82 deletions(-)

diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index d8775e909f..5230300779 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -279,77 +279,30 @@ std::vector<std::array<double, 2>> solve(test_t param)
   //-----------------------------------------------------------------------------------
   std::vector<quda::ColorSpinorField> in(Nsrc);
   std::vector<quda::ColorSpinorField> out(Nsrc);
+  std::vector<quda::ColorSpinorField> out_multishift(Nsrc * multishift);
+  quda::ColorSpinorField ref;
+  quda::ColorSpinorField tmp;
   quda::ColorSpinorParam cs_param;
   constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param);
-  for (int k = 0; k < Nsrc; k++) {
-    in[k] = quda::ColorSpinorField(cs_param);
-    out[k] = quda::ColorSpinorField(cs_param);
-  }
-  ColorSpinorField ref(cs_param);
-  ColorSpinorField tmp(cs_param);
+  ref = quda::ColorSpinorField(cs_param);
+  tmp = quda::ColorSpinorField(cs_param);
+  std::vector<std::vector<void *>> _hp_multi_x(Nsrc, std::vector<void*>(multishift));
+
   // Staggered vector construct END
   //-----------------------------------------------------------------------------------
 
-  // Prepare rng
-  quda::RNG rng(ref, 1234);
-
-  // Performance measuring
-  std::vector<double> time(Nsrc);
-  std::vector<double> gflops(Nsrc);
-  std::vector<int> iter(Nsrc);
-
-  // Populate `in` with random noise
-  for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM); }
-
-  // QUDA invert test
-  //----------------------------------------------------------------------------
-
-  std::vector<std::array<double, 2>> res(Nsrc);
+  // Setup multishift parameters (if needed)
+  //---------------------------------------------------------------------------
 
-  if (multishift == 1) {
-    if (!use_split_grid) {
-      for (int k = 0; k < Nsrc; k++) {
-        if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
-        invertQuda(out[k].data(), in[k].data(), &inv_param);
-        time[k] = inv_param.secs;
-        gflops[k] = inv_param.gflops / inv_param.secs;
-        iter[k] = inv_param.iter;
-        printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs,
-                   inv_param.gflops / inv_param.secs);
-      }
-    } else {
-      std::vector<void *> _hp_x(Nsrc);
-      std::vector<void *> _hp_b(Nsrc);
-      for (int k = 0; k < Nsrc; k++) {
-        _hp_x[k] = out[k].data();
-        _hp_b[k] = in[k].data();
-      }
-      inv_param.num_src = Nsrc;
-      inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition;
-      invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(), cpuLongMILC.data(),
-                                  &gauge_param);
-      quda::comm_allreduce_int(inv_param.iter);
-      inv_param.iter /= comm_size() / num_sub_partition;
-      quda::comm_allreduce_sum(inv_param.gflops);
-      inv_param.gflops /= comm_size() / num_sub_partition;
-      quda::comm_allreduce_max(inv_param.secs);
-      printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter,
-                 inv_param.secs, inv_param.gflops / inv_param.secs);
-    }
+  // Masses
+  std::vector<double> masses(multishift);
 
-    for (int k = 0; k < Nsrc; k++) {
-      if (verify_results)
-        res[k] = verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, inv_param, 0);
-    }
-  } else if (multishift > 1) {
+  if (multishift > 1) {
     if (use_split_grid)
       errorQuda("Multishift currently doesn't support split grid.\n");
 
     inv_param.num_offset = multishift;
 
-    // Prepare vectors for masses
-    std::vector<double> masses(multishift);
-
     // Consistency check for masses, tols, tols_hq size if we're setting custom values
     if (multishift_shifts.size() != 0)
       errorQuda("Multishift shifts are not supported for Wilson-type fermions");
@@ -360,47 +313,112 @@ std::vector<std::array<double, 2>> solve(test_t param)
     if (multishift_tols_hq.size() != 0 && multishift_tols_hq.size() != static_cast<unsigned long>(multishift))
       errorQuda("Multishift hq tolerance count %d does not agree with number of masses passed in %lu\n", multishift, multishift_tols_hq.size());
 
-    // Allocate storage of output arrays
-    std::vector<void*> outArray(multishift);
-    std::vector<ColorSpinorField> qudaOutArray(multishift, cs_param);
-
-    // Copy offsets and tolerances into inv_param; copy data pointers into outArray
+    // Copy offsets and tolerances into inv_param; allocate and copy data pointers
     for (int i = 0; i < multishift; i++) {
       masses[i] = (multishift_masses.size() == 0 ? (mass + i * i * 0.01) : multishift_masses[i]);
       inv_param.offset[i] = 4 * masses[i] * masses[i];
       inv_param.tol_offset[i] = (multishift_tols.size() == 0 ? inv_param.tol : multishift_tols[i]);
       inv_param.tol_hq_offset[i] = (multishift_tols_hq.size() == 0 ? inv_param.tol_hq : multishift_tols_hq[i]);
 
-      outArray[i] = qudaOutArray[i].data();
+      // Allocate memory and set pointers
+      for (int n = 0; n < Nsrc; n++) {
+        out_multishift[n * multishift + i] = quda::ColorSpinorField(cs_param);
+        _hp_multi_x[n][i] = out_multishift[n * multishift + i].data();
+      }
 
       logQuda(QUDA_VERBOSE, "Multishift mass %d = %e ; tolerance %e ; hq tolerance %e\n", i, masses[i], inv_param.tol_offset[i], inv_param.tol_hq_offset[i]);
     }
+  }
 
-    for (int k = 0; k < Nsrc; k++) {
-      quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM);
-      invertMultiShiftQuda((void **)outArray.data(), in[k].data(), &inv_param);
+  // Setup multishift parameters END
+  //-----------------------------------------------------------------------------------
 
-      time[k] = inv_param.secs;
-      gflops[k] = inv_param.gflops / inv_param.secs;
-      iter[k] = inv_param.iter;
-      printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs,
-                 inv_param.gflops / inv_param.secs);
+  // Prepare rng, fill host spinors with random numbers
+  //-----------------------------------------------------------------------------------
+
+  std::vector<double> time(Nsrc);
+  std::vector<double> gflops(Nsrc);
+  std::vector<int> iter(Nsrc);
 
+  quda::RNG rng(ref, 1234);
+
+  for (int n = 0; n < Nsrc; n++) {
+    // Populate the host spinor with random numbers.
+    in[n] = quda::ColorSpinorField(cs_param);
+    quda::spinorNoise(in[n], rng, QUDA_NOISE_UNIFORM);
+    out[n] = quda::ColorSpinorField(cs_param);
+  }
 
-      for (int i = 0; i < multishift; i++) {
-        printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        auto resid = verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i);
+  // Prepare rng, fill host spinors with random numbers END
+  //-----------------------------------------------------------------------------------
 
-        // take the HQ residual from the lightest mass
-        if (i == 0) {
-          res[k] = resid;
+  // QUDA invert test
+  //----------------------------------------------------------------------------
+
+  std::vector<std::array<double, 2>> res(Nsrc);
+
+  if (!use_split_grid) {
+
+    for (int n = 0; n < Nsrc; n++) {
+      // If deflating, preserve the deflation space between solves
+      if (inv_deflate) eig_param.preserve_deflation = n < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
+      // Perform QUDA inversions
+      if (multishift > 1) {
+        invertMultiShiftQuda(_hp_multi_x[n].data(), in[n].data(), &inv_param);
+      } else {
+        invertQuda(out[n].data(), in[n].data(), &inv_param);
+      }
+
+      time[n] = inv_param.secs;
+      gflops[n] = inv_param.gflops / inv_param.secs;
+      iter[n] = inv_param.iter;
+      printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs,
+                  inv_param.gflops / inv_param.secs);
+
+      if (verify_results) {
+        if (multishift > 1) {
+          for (int i = 0; i < multishift; i++) {
+            printfQuda("%dth solution: mass=%f, ", i, masses[i]);
+            auto resid = verifyStaggeredInversion(tmp, ref, in[n], out_multishift[n * multishift + i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i);
+
+            // take the HQ residual from the lightest mass
+            if (i == 0) {
+              res[n] = resid;
+            } else {
+                if (resid[0] > res[n][0]) res[n][0] = resid[0];
+            }
+          }
         } else {
-          if (resid[0] > res[k][0]) res[k][0] = resid[0];
+          res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, inv_param, 0);
         }
       }
     }
   } else {
-    errorQuda("Invalid number of shifts %d", multishift);
+    inv_param.num_src = Nsrc;
+    inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition;
+    // Host arrays for solutions, sources, and check
+    std::vector<void *> _hp_x(Nsrc);
+    std::vector<void *> _hp_b(Nsrc);
+    for (int n = 0; n < Nsrc; n++) {
+      _hp_x[n] = out[n].data();
+      _hp_b[n] = in[n].data();
+    }
+    // Run split grid
+    invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(), cpuLongMILC.data(),
+                                &gauge_param);
+
+    quda::comm_allreduce_int(inv_param.iter);
+    inv_param.iter /= comm_size() / num_sub_partition;
+    quda::comm_allreduce_sum(inv_param.gflops);
+    inv_param.gflops /= comm_size() / num_sub_partition;
+    quda::comm_allreduce_max(inv_param.secs);
+    printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter,
+                inv_param.secs, inv_param.gflops / inv_param.secs);
+
+    for (int n = 0; n < Nsrc; n++) {
+      if (verify_results)
+        res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, inv_param, 0);
+    }
   }
 
   // Free the multigrid solver
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 18c290b82c..4617c1e85d 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -233,7 +233,7 @@ INSTANTIATE_TEST_SUITE_P(EvenOdd, StaggeredInvertTest,
 
 // full system normal solve
 INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredInvertTest,
-                         Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE),
+                         Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE),
                                  sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 

From a5b89ebedbf7b254d34bc0889d6cb138e8b5167c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 6 Dec 2023 13:23:00 -0800
Subject: [PATCH 24/53] Updated verifyStaggeredInversion to look like the
 regular verifyInversion routine

---
 tests/host_reference/dslash_reference.cpp | 115 +++++++++++++++-------
 tests/host_reference/dslash_reference.h   |   8 +-
 tests/staggered_invert_test.cpp           |  40 +++-----
 3 files changed, 98 insertions(+), 65 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index d92a4cf97d..b2688b0094 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -744,59 +744,98 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
 }
 
 std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link,
-                                QudaInvertParam &inv_param, int shift)
+                                quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                QudaInvertParam &inv_param) {
+  std::vector<quda::ColorSpinorField> out_vector(1);
+  out_vector[0] = out;
+  return verifyStaggeredInversion(tmp, ref, in, out_vector, fat_link,
+                                  long_link, inv_param);
+}
+
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
+                                std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                QudaInvertParam &inv_param)
 {
   int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
+  double l2r_max = 0.0;
+  double hqr_max = 0.0;
+  if (multishift > 1) {
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      errorQuda("Multishift solves do not support the laplace operator (yet)");
 
-  if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
-    stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
+    if (inv_param.solution_type != QUDA_MATPC_SOLUTION)
+      errorQuda("Invalid staggered multishift solution type %d, expected QUDA_MATPC_SOLUTION", inv_param.solution_type);
 
-    // correct for the massRescale function inside invertQuda
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
-      ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision());
-  } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) {
+    // Check the mat_pc type and make sure it's sane
     QudaParity parity = QUDA_INVALID_PARITY;
     switch (inv_param.matpc_type) {
       case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
       case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
       default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
     }
-    stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type);
-  } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-    stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type);
-    stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
-  } else {
-    errorQuda("Invalid staggered solution type %d", inv_param.solution_type);
-  }
 
-  int len = 0;
-  if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-    len = V;
-  } else {
-    len = Vh;
-  }
+    for (int i = 0; i < multishift; i++) {
+      auto& out = out_vector[i];
+      double mass = 0.5 * sqrt(inv_param.offset[i]);
+      stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type);
 
-  mxpy(in.data(), ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
-  double nrm2 = norm_2(ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
-  double src2 = norm_2(in.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
-  double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z);
-  double l2r = sqrt(nrm2 / src2);
+      mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+      double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+      double src2 = norm_2(in.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+      double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z);
+      double l2r = sqrt(nrm2 / src2);
+
+      printfQuda("%dth solution: mass=%f, ", i, mass);
+      printfQuda("Shift %2d residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, "
+                "QUDA = %9.6e, host = %9.6e\n",
+                i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r,
+                inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr);
+      // Empirical: if the CPU residual is more than an order of magnitude above the target tolerance, the solve failed to converge
+      if (sqrt(nrm2 / src2) > 10 * inv_param.tol_offset[i]) {
+        printfQuda("Shift %2d has empirically failed to converge\n", i);
+      }
+
+      l2r_max = std::max(l2r_max, l2r);
+      hqr_max = std::max(hqr_max, hqr);
+    }
 
-  if (multishift == 1) {
-    printfQuda("Residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, QUDA = %9.6e, "
-               "host = %9.6e\n",
-               inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);
   } else {
-    printfQuda("Shift %2d residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, "
-               "QUDA = %9.6e, host = %9.6e\n",
-               shift, inv_param.tol_offset[shift], inv_param.true_res_offset[shift], l2r,
-               inv_param.tol_hq_offset[shift], inv_param.true_res_hq_offset[shift], hqr);
-    // Empirical: if the cpu residue is more than 1 order the target accuracy, then it fails to converge
-    if (sqrt(nrm2 / src2) > 10 * inv_param.tol_offset[shift]) {
-      printfQuda("Shift %2d has empirically failed to converge\n", shift);
+    auto& out = out_vector[0];
+    double mass = inv_param.mass;
+    if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
+      stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
+
+      // correct for the massRescale function inside invertQuda
+      if (dslash_type == QUDA_LAPLACE_DSLASH)
+        ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision());
+    } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) {
+      QudaParity parity = QUDA_INVALID_PARITY;
+      switch (inv_param.matpc_type) {
+        case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
+        case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
+        default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
+      }
+      stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type);
+    } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
+      stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type);
+      stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
+    } else {
+      errorQuda("Invalid staggered solution type %d", inv_param.solution_type);
     }
+
+    mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+    double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+    double src2 = norm_2(in.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+    double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z);
+    double l2r = sqrt(nrm2 / src2);
+
+    printfQuda("Residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, QUDA = %9.6e, "
+                "host = %9.6e\n",
+                inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);
+
+    l2r_max = l2r;
+    hqr_max = hqr;
   }
 
-  return {l2r, inv_param.tol_hq};
+  return {l2r_max, hqr_max};
 }
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index b17238bac4..85fc096ff6 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -110,8 +110,12 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
 std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link,
-                                QudaInvertParam &inv_param, int shift);
+                                quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                QudaInvertParam &inv_param);
+
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
+                                std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                QudaInvertParam &inv_param);
 
 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 5230300779..373d78b4f7 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -355,8 +355,6 @@ std::vector<std::array<double, 2>> solve(test_t param)
   // QUDA invert test
   //----------------------------------------------------------------------------
 
-  std::vector<std::array<double, 2>> res(Nsrc);
-
   if (!use_split_grid) {
 
     for (int n = 0; n < Nsrc; n++) {
@@ -374,24 +372,6 @@ std::vector<std::array<double, 2>> solve(test_t param)
       iter[n] = inv_param.iter;
       printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs,
                   inv_param.gflops / inv_param.secs);
-
-      if (verify_results) {
-        if (multishift > 1) {
-          for (int i = 0; i < multishift; i++) {
-            printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-            auto resid = verifyStaggeredInversion(tmp, ref, in[n], out_multishift[n * multishift + i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i);
-
-            // take the HQ residual from the lightest mass
-            if (i == 0) {
-              res[n] = resid;
-            } else {
-                if (resid[0] > res[n][0]) res[n][0] = resid[0];
-            }
-          }
-        } else {
-          res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, inv_param, 0);
-        }
-      }
     }
   } else {
     inv_param.num_src = Nsrc;
@@ -414,11 +394,6 @@ std::vector<std::array<double, 2>> solve(test_t param)
     quda::comm_allreduce_max(inv_param.secs);
     printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter,
                 inv_param.secs, inv_param.gflops / inv_param.secs);
-
-    for (int n = 0; n < Nsrc; n++) {
-      if (verify_results)
-        res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, inv_param, 0);
-    }
   }
 
   // Free the multigrid solver
@@ -427,6 +402,21 @@ std::vector<std::array<double, 2>> solve(test_t param)
   // Compute timings
   if (Nsrc > 1 && !use_split_grid) performanceStats(time, gflops, iter);
 
+  std::vector<std::array<double, 2>> res(Nsrc);
+  // Perform host side verification of inversion if requested
+  if (verify_results) {
+    for (int n = 0; n < Nsrc; n++) {
+      if (multishift > 1) {
+        printfQuda("\nSource %d:\n", n);
+        // Create an appropriate subset of the full out_multishift vector
+        std::vector<quda::ColorSpinorField> out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift};
+        res[n] = verifyStaggeredInversion(tmp, ref, in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param);
+      } else {
+        res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param);
+      }
+    }
+  }
+
   return res;
 }
 

From 6e339612f377211f7105b5b8b4ff3d02fc7abb36 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 6 Dec 2023 14:57:00 -0800
Subject: [PATCH 25/53] Refactored staggered_eigensolve_test to look more like
 eigensolve_test, working towards a ctest

---
 tests/staggered_eigensolve_test.cpp | 351 ++++++++++++++++++----------
 1 file changed, 229 insertions(+), 122 deletions(-)

diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 797dcb1311..c458cb4aaf 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -15,125 +15,116 @@
 #include <llfat_utils.h>
 #include <qio_field.h>
 
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
+QudaGaugeParam gauge_param;
+QudaInvertParam eig_inv_param;
+QudaEigParam eig_param;
 
-void display_test_info()
+// if "--enable-testing true" is passed, we run the tests defined in here
+//#include <staggered_eigensolve_test_gtest.hpp>
+
+void display_test_info(QudaEigParam &param)
 {
   printfQuda("running the following test:\n");
-  printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon S_dimension T_dimension\n");
-  printfQuda("%s   %s             %s            %s         %d/%d/%d          %d \n", get_prec_str(prec),
-             get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy),
-             xdim, ydim, zdim, tdim);
+
+  printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon S_dimension T_dimension Ls_dimension\n");
+  printfQuda("%s   %s             %s            %s            %d/%d/%d          %d         %d\n", get_prec_str(prec),
+             get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy), xdim, ydim, zdim,
+             tdim, Lsdim);
 
   printfQuda("\n   Eigensolver parameters\n");
-  printfQuda(" - solver mode %s\n", get_eig_type_str(eig_type));
-  printfQuda(" - spectrum requested %s\n", get_eig_spectrum_str(eig_spectrum));
-  if (eig_type == QUDA_EIG_BLK_TR_LANCZOS) printfQuda(" - eigenvector block size %d\n", eig_block_size);
-  printfQuda(" - number of eigenvectors requested %d\n", eig_n_conv);
-  printfQuda(" - size of eigenvector search space %d\n", eig_n_ev);
-  printfQuda(" - size of Krylov space %d\n", eig_n_kr);
-  printfQuda(" - solver tolerance %e\n", eig_tol);
-  printfQuda(" - convergence required (%s)\n", eig_require_convergence ? "true" : "false");
-  if (eig_compute_svd) {
+  printfQuda(" - solver mode %s\n", get_eig_type_str(param.eig_type));
+  printfQuda(" - spectrum requested %s\n", get_eig_spectrum_str(param.spectrum));
+  if (param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) printfQuda(" - eigenvector block size %d\n", param.block_size);
+  printfQuda(" - number of eigenvectors requested %d\n", param.n_conv);
+  printfQuda(" - size of eigenvector search space %d\n", param.n_ev);
+  printfQuda(" - size of Krylov space %d\n", param.n_kr);
+  printfQuda(" - solver tolerance %e\n", param.tol);
+  printfQuda(" - convergence required (%s)\n", param.require_convergence ? "true" : "false");
+  if (param.compute_svd) {
     printfQuda(" - Operator: MdagM. Will compute SVD of M\n");
     printfQuda(" - ***********************************************************\n");
     printfQuda(" - **** Overriding any previous choices of operator type. ****\n");
     printfQuda(" - ****    SVD demands normal operator, will use MdagM    ****\n");
     printfQuda(" - ***********************************************************\n");
   } else {
-    printfQuda(" - Operator: daggered (%s) , norm-op (%s)\n", eig_use_dagger ? "true" : "false",
-               eig_use_normop ? "true" : "false");
+    printfQuda(" - Operator: daggered (%s) , norm-op (%s), even-odd pc (%s)\n", param.use_dagger ? "true" : "false",
+               param.use_norm_op ? "true" : "false", param.use_pc ? "true" : "false");
   }
-  if (eig_use_poly_acc) {
-    printfQuda(" - Chebyshev polynomial degree %d\n", eig_poly_deg);
-    printfQuda(" - Chebyshev polynomial minumum %e\n", eig_amin);
-    if (eig_amax < 0)
+  if (param.use_poly_acc) {
+    printfQuda(" - Chebyshev polynomial degree %d\n", param.poly_deg);
+    printfQuda(" - Chebyshev polynomial minumum %e\n", param.a_min);
+    if (param.a_max <= 0)
       printfQuda(" - Chebyshev polynomial maximum will be computed\n");
     else
-      printfQuda(" - Chebyshev polynomial maximum %e\n\n", eig_amax);
+      printfQuda(" - Chebyshev polynomial maximum %e\n\n", param.a_max);
   }
-
   printfQuda("Grid partition info:     X  Y  Z  T\n");
   printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
              dimPartitioned(3));
 }
 
-int main(int argc, char **argv)
-{
-  // Set defaults
-  setQudaStaggeredDefaultInvTestParams();
-
-  auto app = make_app();
-  add_eigen_option_group(app);
-
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
-  }
-
-  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
-  initComms(argc, argv, gridsize_from_cmdline);
-
-  // Set values for precisions via the command line.
-  setQudaPrecisions();
-
-  // Only these fermions are supported in this file
-  if (is_laplace_enabled) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
-      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
-  } else {
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
-      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
-      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
-  }
-
-  display_test_info();
+GaugeField cpuFatQDP = {};
+GaugeField cpuLongQDP = {};
+GaugeField cpuFatMILC = {};
+GaugeField cpuLongMILC = {};
 
+void init()
+{
   // Set QUDA internal parameters
-  QudaGaugeParam gauge_param = newQudaGaugeParam();
+  gauge_param = newQudaGaugeParam();
   setStaggeredGaugeParam(gauge_param);
+
   // Though no inversions are performed, the inv_param
   // structure contains all the information we need to
-  // construct the dirac operator. We encapsualte the
-  // inv_param structure inside the eig_param structure
-  // to avoid any confusion
-  QudaInvertParam eig_inv_param = newQudaInvertParam();
+  // construct the dirac operator.
+  eig_inv_param = newQudaInvertParam();
   setStaggeredInvertParam(eig_inv_param);
-  QudaEigParam eig_param = newQudaEigParam();
-  setEigParam(eig_param);
-  // We encapsulate the eigensolver parameters inside the invert parameter structure
-  eig_param.invert_param = &eig_inv_param;
-
-  if (eig_param.arpack_check && !(prec == QUDA_DOUBLE_PRECISION)) {
-    errorQuda("ARPACK check only available in double precision");
-  }
 
-  initQuda(device_ordinal);
+  eig_param = newQudaEigParam();
+  // We encapsulate the inv_param structure inside the eig_param structure
+  eig_param.invert_param = &eig_inv_param;
+  setEigParam(eig_param);
 
   setDims(gauge_param.X);
-  dw_setDims(gauge_param.X, 1); // so we can use 5-d indexing from dwf
+  dw_setDims(gauge_param.X, 1);
 
   // Staggered Gauge construct START
   //-----------------------------------------------------------------------------------
-  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *milc_fatlink = nullptr;
-  void *milc_longlink = nullptr;
-
-  for (int dir = 0; dir < 4; dir++) {
-    qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    qdp_fatlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    qdp_longlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-  }
-  milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-  milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-
+  // Allocate host staggered gauge fields
+  gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ?
+    QUDA_SU3_LINKS :
+    QUDA_ASQTAD_FAT_LINKS;
+  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
+  gauge_param.location = QUDA_CPU_FIELD_LOCATION;
+
+  GaugeFieldParam cpuParam(gauge_param);
+  cpuParam.order = QUDA_QDP_GAUGE_ORDER;
+  cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
+  cpuParam.create = QUDA_NULL_FIELD_CREATE;
+  GaugeField cpuIn = GaugeField(cpuParam);
+  cpuFatQDP = GaugeField(cpuParam);
+  cpuParam.order = QUDA_MILC_GAUGE_ORDER;
+  cpuFatMILC = GaugeField(cpuParam);
+
+  cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS;
+  cpuParam.nFace = 3;
+  cpuParam.order = QUDA_QDP_GAUGE_ORDER;
+  cpuLongQDP = GaugeField(cpuParam);
+  cpuParam.order = QUDA_MILC_GAUGE_ORDER;
+  cpuLongMILC = GaugeField(cpuParam);
+
+  void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
+  void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
+  void *qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)};
   constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, true);
 
+  // Reorder gauge fields to MILC order
+  cpuFatMILC = cpuFatQDP;
+  cpuLongMILC = cpuLongQDP;
+
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
+  // This needs to be called before `loadFatLongGaugeQuda` because this routine also loads the
+  // gauge fields with different parameters.
   double plaq[3];
   computeStaggeredPlaquetteQDPOrder(qdp_inlink, plaq, gauge_param, dslash_type);
   printfQuda("Computed plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
@@ -144,60 +135,176 @@ int main(int argc, char **argv)
     printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
   }
 
-  // Reorder gauge fields to MILC order
-  reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-  reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+  freeGaugeQuda();
+
+  loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param);
 
-  loadFatLongGaugeQuda(milc_fatlink, milc_longlink, gauge_param);
+  // now copy back to QDP aliases, since these are used for the reference dslash
+  cpuFatQDP = cpuFatMILC;
+  cpuLongQDP = cpuLongMILC;
+  // ensure QDP alias has exchanged ghosts
+  cpuFatQDP.exchangeGhost();
+  cpuLongQDP.exchangeGhost();
 
   // Staggered Gauge construct END
   //-----------------------------------------------------------------------------------
+}
 
+//std::vector<double> eigensolve(test_t test_param)
+std::vector<double> eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd, QudaEigSpectrumType spectrum)
+{
+  // Collect testing parameters from gtest
+  eig_param.eig_type = eig_type; //::testing::get<0>(test_param);
+  eig_param.use_norm_op = use_norm_op; //::testing::get<1>(test_param);
+  eig_param.use_pc = use_pc; //::testing::get<2>(test_param);
+  eig_param.compute_svd = compute_svd; //::testing::get<3>(test_param);
+  eig_param.spectrum = spectrum; //::testing::get<4>(test_param);
+
+  if (eig_param.use_pc)
+    eig_inv_param.solution_type = QUDA_MATPC_SOLUTION;
+  else
+    eig_inv_param.solution_type = QUDA_MAT_SOLUTION;
+
+  // For gtest testing, we prohibit the use of polynomial acceleration as
+  // the fine tuning required can inhibit convergence of an otherwise
+  // perfectly good algorithm. We also have a default value of 4
+  // for the block size in Block TRLM, and 4 for the batched rotation.
+  // The user may change these values via the command line:
+  // --eig-block-size
+  // --eig-batched-rotate
+  if (enable_testing) {
+    eig_use_poly_acc = false;
+    eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE;
+    eig_block_size != 4 ? eig_param.block_size = eig_block_size : eig_param.block_size = 4;
+    eig_batched_rotate != 0 ? eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 4;
+  }
+
+  logQuda(QUDA_SUMMARIZE, "Action = %s, Solver = %s, norm-op = %s, even-odd = %s, with SVD = %s, spectrum = %s\n",
+          get_dslash_str(dslash_type),
+          get_eig_type_str(eig_param.eig_type), eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? "true" : "false",
+          eig_param.use_pc == QUDA_BOOLEAN_TRUE ? "true" : "false",
+          eig_param.compute_svd == QUDA_BOOLEAN_TRUE ? "true" : "false", get_eig_spectrum_str(eig_param.spectrum));
+
+  display_test_info(eig_param);
+
+  // Vector construct START
+  //----------------------------------------------------------------------------
   // Host side arrays to store the eigenpairs computed by QUDA
-  void **host_evecs = (void **)safe_malloc(eig_n_conv * sizeof(void *));
-  for (int i = 0; i < eig_n_conv; i++) {
-    host_evecs[i] = (void *)safe_malloc(V * stag_spinor_site_size * eig_inv_param.cpu_prec);
+  int n_eig = eig_n_conv;
+  if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2;
+  std::vector<quda::ColorSpinorField> evecs(n_eig);
+  quda::ColorSpinorParam cs_param;
+  constructStaggeredTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param);
+  // Void pointers to host side arrays, compatible with the QUDA interface.
+  std::vector<void *> host_evecs_ptr(n_eig);
+  // Allocate host side memory and pointers
+  for (int i = 0; i < n_eig; i++) {
+    evecs[i] = quda::ColorSpinorField(cs_param);
+    host_evecs_ptr[i] = evecs[i].data();
   }
-  double _Complex *host_evals = (double _Complex *)safe_malloc(eig_param.n_ev * sizeof(double _Complex));
 
-  double time = 0.0;
+  // Complex eigenvalues
+  std::vector<__complex__ double> evals(eig_n_conv);
+  // Vector construct END
+  //----------------------------------------------------------------------------
 
-  // QUDA eigensolver test
+  // QUDA eigensolver test BEGIN
   //----------------------------------------------------------------------------
-  if ((solve_type == QUDA_DIRECT_SOLVE && solution_type == QUDA_MAT_SOLUTION) ||
-    (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type == QUDA_MATPC_SOLUTION) ||
-    (solve_type == QUDA_NORMOP_SOLVE && solution_type == QUDA_MATDAG_MAT_SOLUTION)) {
-    // This function returns the host_evecs and host_evals pointers, populated with
-    // the requested data, at the requested prec. All the information needed to
-    // perfom the solve is in the eig_param container.
-    // If eig_param.arpack_check == true and precision is double, the routine will
-    // use ARPACK rather than the GPU.
-
-    time = -((double)clock());
-    eigensolveQuda(host_evecs, host_evals, &eig_param);
-    time += (double)clock();
-
-    printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", time / CLOCKS_PER_SEC);
-  } else {
-    errorQuda("Unsupported combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type));
+  // This function returns the host_evecs and host_evals pointers, populated with the
+  // requested data, at the requested prec. All the information needed to perform the
+  // solve is in the eig_param container. If eig_param.arpack_check == true and
+  // precision is double, the routine will use ARPACK rather than the GPU.
+  quda::host_timer_t host_timer;
+  host_timer.start();
+  eigensolveQuda(host_evecs_ptr.data(), evals.data(), &eig_param);
+  host_timer.stop();
+  printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last());
+
+  // Perform host side verification of eigenvector if requested.
+  // ...
+
+  std::vector<double> residua(eig_n_conv, 0.0);
+  return residua;
+  // QUDA eigensolver test COMPLETE
+  //----------------------------------------------------------------------------
+}
+
+void cleanup()
+{
+  cpuFatQDP = {};
+  cpuLongQDP = {};
+  cpuFatMILC = {};
+  cpuLongMILC = {};
+}
+
+int main(int argc, char **argv)
+{
+  // Set defaults
+  setQudaStaggeredDefaultInvTestParams();
+
+  auto app = make_app();
+  add_eigen_option_group(app);
+  //add_testing_option_group(app);
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
+  }
+  setVerbosity(verbosity);
 
-  } // switch
+  // Set values for precisions via the command line.
+  setQudaPrecisions();
+
+  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
+  initComms(argc, argv, gridsize_from_cmdline);
+
+  initRand();
+
+  // Only these fermions are supported in this file
+  if (is_laplace_enabled) {
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  } else {
+    if (dslash_type == QUDA_LAPLACE_DSLASH)
+      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+  }
 
-  // Deallocate host memory
-  for (int i = 0; i < eig_n_conv; i++) host_free(host_evecs[i]);
-  host_free(host_evecs);
-  host_free(host_evals);
+  if (eig_param.arpack_check && !(prec == QUDA_DOUBLE_PRECISION)) {
+    errorQuda("ARPACK check only available in double precision");
+  }
 
-  // Clean up gauge fields.
-  for (int dir = 0; dir < 4; dir++) {
-    host_free(qdp_inlink[dir]);
-    host_free(qdp_fatlink[dir]);
-    host_free(qdp_longlink[dir]);
+  // Sanity check combinations of solve type and solution type
+  if ((solve_type == QUDA_DIRECT_SOLVE && solution_type != QUDA_MAT_SOLUTION) ||
+    (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type != QUDA_MATPC_SOLUTION) ||
+    (solve_type == QUDA_NORMOP_SOLVE && solution_type != QUDA_MATDAG_MAT_SOLUTION)) {
+    errorQuda("Invalid combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type));
   }
 
-  host_free(milc_fatlink);
-  host_free(milc_longlink);
+  initQuda(device_ordinal);
+
+  init();
+
+  int result = 0;
+  //if (enable_testing) { // tests are defined in invert_test_gtest.hpp
+    //::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
+    //if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
+    //result = RUN_ALL_TESTS();
+  //} else {
+    //eigensolve(
+    //  test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum});
+    eigensolve(eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum);
+  //}
 
+  cleanup();
+
+  // Memory clean-up
+  freeGaugeQuda();
+
+  // Finalize the QUDA library
   endQuda();
   finalizeComms();
+
+  return result;
 }

From 829ce62c6e7fdd5bba8429ef77b371223f42d1ab Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 6 Dec 2023 21:09:30 -0800
Subject: [PATCH 26/53] Abstracted the staggered eigensolver test into a gtest.
 The tests and their parameters converge (slowly), but there is no verify yet.

---
 tests/staggered_eigensolve_test.cpp       |  61 +++++---
 tests/staggered_eigensolve_test_gtest.hpp | 176 ++++++++++++++++++++++
 tests/staggered_invert_test.cpp           |  28 ++--
 3 files changed, 229 insertions(+), 36 deletions(-)
 create mode 100644 tests/staggered_eigensolve_test_gtest.hpp

diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index c458cb4aaf..aab4a99c1f 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -20,7 +20,7 @@ QudaInvertParam eig_inv_param;
 QudaEigParam eig_param;
 
 // if "--enable-testing true" is passed, we run the tests defined in here
-//#include <staggered_eigensolve_test_gtest.hpp>
+#include <staggered_eigensolve_test_gtest.hpp>
 
 void display_test_info(QudaEigParam &param)
 {
@@ -150,15 +150,14 @@ void init()
   //-----------------------------------------------------------------------------------
 }
 
-//std::vector<double> eigensolve(test_t test_param)
-std::vector<double> eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd, QudaEigSpectrumType spectrum)
+std::vector<double> eigensolve(test_t test_param)
 {
   // Collect testing parameters from gtest
-  eig_param.eig_type = eig_type; //::testing::get<0>(test_param);
-  eig_param.use_norm_op = use_norm_op; //::testing::get<1>(test_param);
-  eig_param.use_pc = use_pc; //::testing::get<2>(test_param);
-  eig_param.compute_svd = compute_svd; //::testing::get<3>(test_param);
-  eig_param.spectrum = spectrum; //::testing::get<4>(test_param);
+  eig_param.eig_type = ::testing::get<0>(test_param);
+  eig_param.use_norm_op = ::testing::get<1>(test_param);
+  eig_param.use_pc = ::testing::get<2>(test_param);
+  eig_param.compute_svd = ::testing::get<3>(test_param);
+  eig_param.spectrum = ::testing::get<4>(test_param);
 
   if (eig_param.use_pc)
     eig_inv_param.solution_type = QUDA_MATPC_SOLUTION;
@@ -175,7 +174,6 @@ std::vector<double> eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, Qu
   if (enable_testing) {
     eig_use_poly_acc = false;
     eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE;
-    eig_block_size != 4 ? eig_param.block_size = eig_block_size : eig_param.block_size = 4;
     eig_batched_rotate != 0 ? eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 4;
   }
 
@@ -185,7 +183,8 @@ std::vector<double> eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, Qu
           eig_param.use_pc == QUDA_BOOLEAN_TRUE ? "true" : "false",
           eig_param.compute_svd == QUDA_BOOLEAN_TRUE ? "true" : "false", get_eig_spectrum_str(eig_param.spectrum));
 
-  display_test_info(eig_param);
+  if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE))
+    display_test_info(eig_param);
 
   // Vector construct START
   //----------------------------------------------------------------------------
@@ -239,12 +238,13 @@ void cleanup()
 
 int main(int argc, char **argv)
 {
+  ::testing::InitGoogleTest(&argc, argv);
   // Set defaults
   setQudaStaggeredDefaultInvTestParams();
 
   auto app = make_app();
   add_eigen_option_group(app);
-  //add_testing_option_group(app);
+  add_testing_option_group(app);
   try {
     app->parse(argc, argv);
   } catch (const CLI::ParseError &e) {
@@ -284,18 +284,39 @@ int main(int argc, char **argv)
 
   initQuda(device_ordinal);
 
+  if (enable_testing) {
+    // We need to force a well-behaved operator + reasonable convergence, otherwise
+    // the staggered tests will fail. These checks are designed to be consistent
+    // with what's in [src]/tests/CMakeLists.txt, which have been "sanity checked"
+    bool changes = false;
+    if (!compute_fatlong) { compute_fatlong = true; changes = true; }
+
+    double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 1e-4 : 1e-5;
+    if (eig_tol != expected_tol) { eig_tol = expected_tol; changes = true; }
+    if (niter != 1000) { niter = 1000; changes = true; }
+    if (eig_n_kr != 256) { eig_n_kr = 256; changes = true; }
+    if (eig_block_size != 8) { eig_block_size = 8; }
+
+    if (changes) {
+      printfQuda("For gtest, various defaults are changed:\n");
+      printfQuda("  --compute-fat-long true\n");
+      printfQuda("  --eig-tol (1e-5 for double, 1e-4 for single)\n");
+      printfQuda("  --niter 1000\n");
+      printfQuda("  --eig-n-kr 256\n");
+    }
+  }
+
   init();
 
   int result = 0;
-  //if (enable_testing) { // tests are defined in invert_test_gtest.hpp
-    //::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
-    //if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
-    //result = RUN_ALL_TESTS();
-  //} else {
-    //eigensolve(
-    //  test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum});
-    eigensolve(eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum);
-  //}
+  if (enable_testing) { // tests are defined in staggered_eigensolve_test_gtest.hpp
+    ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
+    if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
+    result = RUN_ALL_TESTS();
+  } else {
+    eigensolve(
+      test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum});
+  }
 
   cleanup();
 
diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp
new file mode 100644
index 0000000000..06cf712e07
--- /dev/null
+++ b/tests/staggered_eigensolve_test_gtest.hpp
@@ -0,0 +1,176 @@
+#include <gtest/gtest.h>
+
+using test_t = ::testing::tuple<QudaEigType, QudaBoolean, QudaBoolean, QudaBoolean, QudaEigSpectrumType>;
+
+class StaggeredEigensolveTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  test_t param;
+
+public:
+  StaggeredEigensolveTest() : param(GetParam()) { }
+};
+
+// Get the solve type that this combination corresponds to
+QudaSolveType get_solve_type(QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd) {
+  if (use_norm_op == QUDA_BOOLEAN_FALSE && use_pc == QUDA_BOOLEAN_TRUE && compute_svd == QUDA_BOOLEAN_FALSE)
+    return QUDA_DIRECT_PC_SOLVE;
+  else if (use_norm_op == QUDA_BOOLEAN_TRUE && use_pc == QUDA_BOOLEAN_FALSE && compute_svd == QUDA_BOOLEAN_TRUE)
+    return QUDA_NORMOP_SOLVE;
+  else if (use_norm_op == QUDA_BOOLEAN_FALSE && use_pc == QUDA_BOOLEAN_FALSE && compute_svd == QUDA_BOOLEAN_FALSE)
+    return QUDA_DIRECT_SOLVE;
+  else
+    return QUDA_INVALID_SOLVE;
+}
+
+bool skip_test(test_t test_param)
+{
+  auto eig_type = ::testing::get<0>(test_param);
+  auto use_norm_op = ::testing::get<1>(test_param);
+  auto use_pc = ::testing::get<2>(test_param);
+  auto compute_svd = ::testing::get<3>(test_param);
+  auto spectrum = ::testing::get<4>(test_param);
+
+  // Reverse engineer the operator type
+  QudaSolveType combo_solve_type = get_solve_type(use_norm_op, use_pc, compute_svd);
+  if (combo_solve_type == QUDA_DIRECT_PC_SOLVE) {
+    // matpc
+
+    // this is only legal for the staggered and asqtad op
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+      return true;
+
+    // we can only compute the real part for Lanczos, and real or magnitude for Arnoldi
+    switch (eig_type) {
+    case QUDA_EIG_TR_LANCZOS:
+    case QUDA_EIG_BLK_TR_LANCZOS:
+      if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true;
+      break;
+    case QUDA_EIG_IR_ARNOLDI:
+      if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true;
+      break;
+    default: break;
+    }
+  } else if (combo_solve_type == QUDA_NORMOP_SOLVE) {
+    // matdag_mat
+    
+    // this is only legal for the staggered and asqtad op
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+      return true;
+
+    switch (eig_type) {
+    case QUDA_EIG_TR_LANCZOS:
+    case QUDA_EIG_BLK_TR_LANCZOS:
+      if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true;
+      break;
+    case QUDA_EIG_IR_ARNOLDI:
+      //if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true;
+      return true; // we skip this because it takes an unnecessarily long time and it's covered elsewhere
+      break;
+    default: return true; break;
+    }
+  } else if (combo_solve_type == QUDA_DIRECT_SOLVE) {
+    // mat
+    
+    switch (dslash_type) {
+    case QUDA_STAGGERED_DSLASH:
+      // only Arnoldi, imaginary part or magnitude works (real part is degenerate)
+      // We skip SM because it takes an unnecessarily long time and it's
+      // covered by HISQ
+      if (eig_type != QUDA_EIG_IR_ARNOLDI) return true;
+      if (spectrum != QUDA_SPECTRUM_LI_EIG && spectrum != QUDA_SPECTRUM_SI_EIG &&
+            spectrum != QUDA_SPECTRUM_LM_EIG) return true;
+      break;
+    case QUDA_ASQTAD_DSLASH:
+      // only Arnoldi, imaginary part or magnitude works (real part is degenerate)
+      if (eig_type != QUDA_EIG_IR_ARNOLDI) return true;
+      if (spectrum == QUDA_SPECTRUM_LR_EIG || spectrum == QUDA_SPECTRUM_SR_EIG) return true;
+      break;
+    case QUDA_LAPLACE_DSLASH:
+      switch (eig_type) {
+      case QUDA_EIG_TR_LANCZOS:
+      case QUDA_EIG_BLK_TR_LANCZOS:
+        if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true;
+        break;
+      case QUDA_EIG_IR_ARNOLDI:
+        if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true;
+        break;
+      default: return true; break;
+      }
+      break;
+    default: return true; break;
+    }
+  }
+
+  return false;
+}
+
+std::vector<double> eigensolve(test_t test_param);
+
+TEST_P(StaggeredEigensolveTest, verify)
+{
+  if (skip_test(GetParam())) GTEST_SKIP();
+  double factor = 1.0;
+  // The IRAM eigensolver will sometimes report convergence with tolerances slightly
+  // higher than requested. The same phenomenon occurs in ARPACK. This factor
+  // prevents failure when IRAM has solved to say 2e-6 when 1e-6 is requested.
+  // The solution to avoid this is to use a Krylov space (eig-n-kr) about 3-4 times the
+  // size of the search space (eig-n-ev), or use a well chosen Chebyshev polynomial,
+  // or use a tighter than necessary tolerance.
+  if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10;
+  auto tol = factor * eig_param.tol;
+  for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol);
+}
+
+std::string gettestname(::testing::TestParamInfo<test_t> param)
+{
+  std::string name;
+  name += get_eig_type_str(::testing::get<0>(param.param)) + std::string("_");
+  name += (::testing::get<1>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("normop") : std::string("direct"))
+    + std::string("_");
+  name += (::testing::get<2>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("evenodd") : std::string("full"))
+    + std::string("_");
+  name += (::testing::get<3>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("withSVD") : std::string("noSVD"))
+    + std::string("_");
+  name += get_eig_spectrum_str(::testing::get<4>(param.param));
+  return name;
+}
+
+using ::testing::Combine;
+using ::testing::Values;
+
+// Can solve hermitian systems
+auto hermitian_solvers = Values(QUDA_EIG_TR_LANCZOS, QUDA_EIG_BLK_TR_LANCZOS, QUDA_EIG_IR_ARNOLDI);
+
+// Can solve non-hermitian systems
+auto non_hermitian_solvers = Values(QUDA_EIG_IR_ARNOLDI);
+
+// Eigensolver spectrum types
+auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG);
+auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG,
+                                     QUDA_SPECTRUM_SM_EIG, QUDA_SPECTRUM_LI_EIG, QUDA_SPECTRUM_SI_EIG);
+
+//using test_t = ::testing::tuple<QudaEigType,          // different types of Lanczos/Arnoldi
+//                                QudaBoolean,          // Norm op or not
+//                                QudaBoolean,          // Preconditioned op or not
+//                                QudaBoolean,          // SVD or not
+//                                QudaEigSpectrumType>; // Largest real, smallest real, etc
+
+// Preconditioned direct operators, which are HPD for staggered!
+INSTANTIATE_TEST_SUITE_P(DirectEvenOdd, StaggeredEigensolveTest,
+                         ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_TRUE),
+                                            Values(QUDA_BOOLEAN_FALSE), hermitian_spectrum),
+                         gettestname);
+
+// full system normal solve
+INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredEigensolveTest,
+                         ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_FALSE),
+                                            Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum),
+                         gettestname);
+
+
+// full system direct solve
+INSTANTIATE_TEST_SUITE_P(DirectFull, StaggeredEigensolveTest,
+                         ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE),
+                                            Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum),
+                         gettestname);
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 373d78b4f7..8f4236d16c 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -490,24 +490,20 @@ int main(int argc, char **argv)
     // We need to force a well-behaved operator + reasonable convergence, otherwise
     // the staggered tests will fail. These checks are designed to be consistent
     // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked"
-    if (!compute_fatlong) {
-      warningQuda("compute_fatlong = %d , expected value %d , overriding", compute_fatlong, true);
-      compute_fatlong = true;
-    }
+    bool changes = false;
+    if (!compute_fatlong) { compute_fatlong = true; changes = true; }
 
     double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 1e-5 : 1e-6;
-    if (tol != expected_tol) {
-      warningQuda("tol = %e , expected value %e , overriding", tol, expected_tol);
-      tol = expected_tol;
-    }
-    if (tol_hq != expected_tol) {
-      warningQuda("tol_hq = %e , expected value %e , overriding", tol_hq, expected_tol);
-      tol_hq = 1e-5;
-    }
-
-    if (niter != 1000) {
-      warningQuda("niter = %d , expected value %d , overriding", niter, 1000);
-      compute_fatlong = 1000;
+    if (tol != expected_tol) { tol = expected_tol; changes = true; }
+    if (tol_hq != expected_tol) { tol_hq = expected_tol; changes = true; }
+    if (niter != 1000) { niter = 1000; changes = true; }
+
+    if (changes) {
+      printfQuda("For gtest, various defaults are changed:\n");
+      printfQuda("  --compute-fat-long true\n");
+      printfQuda("  --tol (1e-6 for double, 1e-5 for single)\n");
+      printfQuda("  --tol-hq (1e-6 for double, 1e-5 for single)\n");
+      printfQuda("  --niter 1000\n");
     }
   }
 

From 4837bc64f6726afe62c99146efa5db374088a00c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 6 Dec 2023 22:14:38 -0800
Subject: [PATCH 27/53] Added verify functions for eigenvectors and singular
 vectors

---
 tests/host_reference/dslash_reference.cpp | 79 +++++++++++++++++++++++
 tests/host_reference/dslash_reference.h   |  6 ++
 tests/staggered_eigensolve_test.cpp       | 12 ++++
 3 files changed, 97 insertions(+)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index b2688b0094..21b42247ec 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -839,3 +839,82 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
 
   return {l2r_max, hqr_max};
 }
+
+double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i,
+                                   QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link)
+{
+  QudaInvertParam& inv_param = *(eig_param.invert_param);
+  int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
+  bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false);
+  bool normop = (eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? true : false);
+  double mass = inv_param.mass;
+
+  // Reverse engineer a "solution_type" to help determine which host dslash needs to be applied
+  QudaSolutionType sol_type = QUDA_INVALID_SOLUTION;
+  if (normop) {
+    if (use_pc) errorQuda("The normal preconditioned staggered op is not supported");
+    else sol_type = QUDA_MATDAG_MAT_SOLUTION;
+  } else {
+    if (use_pc) sol_type = QUDA_MATPC_SOLUTION;
+    else sol_type = QUDA_MAT_SOLUTION;
+  }
+
+  // Create temporary spinors
+  quda::ColorSpinorParam csParam(spinor);
+  quda::ColorSpinorField ref(csParam);
+  quda::ColorSpinorField tmp(csParam);
+
+  if (sol_type == QUDA_MAT_SOLUTION) {
+    stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type);
+  } else if (sol_type == QUDA_MATPC_SOLUTION) {
+    QudaParity parity = QUDA_INVALID_PARITY;
+    switch (inv_param.matpc_type) {
+      case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
+      case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
+      default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
+    }
+    stag_matpc(ref, fat_link, long_link, spinor, mass, 0, tmp, parity, dslash_type);
+  } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) {
+    stag_mat(tmp, fat_link, long_link, spinor, mass, dagger, dslash_type);
+    stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
+  }
+
+  // Compute M * x - \lambda * x
+  caxpy(-lambda, spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+  double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+  double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+  double l2r = sqrt(nrm2 / src2);
+
+  printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r);
+
+  return l2r;
+}
+
+double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i,
+                                         QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link)
+{
+  QudaInvertParam& inv_param = *(eig_param.invert_param);
+  int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
+  bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false);
+  double mass = inv_param.mass;
+
+  if (use_pc)
+    errorQuda("The SVD of the preconditioned staggered op is not supported");
+
+  // Create temporary spinors
+  quda::ColorSpinorParam csParam(spinor_left);
+  quda::ColorSpinorField ref(csParam);
+
+  // Only `mat` is used here
+  stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type);
+
+  // Compute M * x_left - \sigma * x_right
+  caxpy(-sigma, spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+  double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+  double src2 = norm_2(spinor_left.data(), spinor_left.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
+  double l2r = sqrt(nrm2 / src2);
+
+  printfQuda("Singular vector pair %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r);
+
+  return l2r;
+}
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 85fc096ff6..6331fbb65a 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -117,6 +117,12 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
                                 std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link,
                                 QudaInvertParam &inv_param);
 
+double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i,
+                                      QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link);
+
+double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i,
+                                         QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link);
+
 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
 //
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index aab4a99c1f..d71bb7cfb2 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -223,6 +223,18 @@ std::vector<double> eigensolve(test_t test_param)
   // ...
 
   std::vector<double> residua(eig_n_conv, 0.0);
+  // Perform host side verification of eigenvector if requested.
+  if (verify_results) {
+    for (int i = 0; i < eig_n_conv; i++) {
+      if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) {
+        double _Complex sigma = evals[i];
+        residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, cpuLongQDP);
+      } else {
+        double _Complex lambda = evals[i];
+        residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP);
+      }
+    }
+  }
   return residua;
   // QUDA eigensolver test COMPLETE
   //----------------------------------------------------------------------------

From 30eb5eec81d79e2af2abb63c9a0e7663edc72704 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 6 Dec 2023 22:55:05 -0800
Subject: [PATCH 28/53] Added a ctest for staggered eigensolves, fixed the
 verify function

---
 tests/CMakeLists.txt                      | 44 ++++++++++++++++++++++-
 tests/staggered_eigensolve_test_gtest.hpp |  3 +-
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3955de2cb1..923df3d04c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1193,7 +1193,7 @@ foreach(prec IN LISTS TEST_PRECS)
   endif()
 endforeach(prec)
 
-# Eigensolves
+# Wilson-type eigensolves
 foreach(prec IN LISTS TEST_PRECS)
 
   if(${prec} STREQUAL "double")
@@ -1360,6 +1360,48 @@ foreach(prec IN LISTS TEST_PRECS)
   endif()
 endforeach(prec)
 
+# Staggered-type eigensolves
+foreach(prec IN LISTS TEST_PRECS)
+
+  # These require looser tolerances to keep iterations to solution in check
+  if(${prec} STREQUAL "double")
+    set(tol 1e-6)
+  elseif(${prec} STREQUAL "single")
+    set(tol 1e-5)
+  endif()
+
+  if(QUDA_DIRAC_STAGGERED)
+    # --compute-fat-long true is necessary to get well-behaved fields
+
+    add_test(NAME eigensolve_test_staggered_${prec}
+      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
+      --dslash-type staggered --compute-fat-long true
+      --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256
+      --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
+      --enable-testing true
+      --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml)
+
+    add_test(NAME eigensolve_test_asqtad_${prec}
+      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
+      --dslash-type asqtad --compute-fat-long true
+      --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256
+      --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
+      --enable-testing true
+      --gtest_output=xml:staggered_eigensolve_test_asqtad_${prec}.xml)
+
+    if (QUDA_LAPLACE)
+      add_test(NAME eigensolve_test_laplace_${prec}
+        COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
+        --dslash-type laplace --compute-fat-long true
+        --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256
+        --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
+        --enable-testing true
+        --gtest_output=xml:staggered_eigensolve_test_laplace_${prec}.xml)
+    endif()
+  endif()
+endforeach(prec)
+
+
 if(QUDA_DIRAC_STAGGERED)
   add_test(NAME hisq_stencil
            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_stencil_ctest> ${MPIEXEC_POSTFLAGS}
diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp
index 06cf712e07..5c5f8e3890 100644
--- a/tests/staggered_eigensolve_test_gtest.hpp
+++ b/tests/staggered_eigensolve_test_gtest.hpp
@@ -117,7 +117,8 @@ TEST_P(StaggeredEigensolveTest, verify)
   // The solution to avoid this is to use a Krylov space (eig-n-kr) about 3-4 times the
   // size of the search space (eig-n-ev), or use a well chosen Chebyshev polynomial,
   // or use a tighter than necessary tolerance.
-  if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10;
+  auto eig_type = ::testing::get<0>(GetParam());
+  if (eig_type == QUDA_EIG_IR_ARNOLDI || eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10;
   auto tol = factor * eig_param.tol;
   for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol);
 }

From e161cdc1b5193f86fbfe2180d44c2e7915756544 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 7 Dec 2023 09:05:27 -0800
Subject: [PATCH 29/53] All sorts of cleanup, moved various
 is_*_[solve/solution/etc] routines into host utils

---
 tests/eigensolve_test_gtest.hpp           |  11 --
 tests/host_reference/dslash_reference.cpp |   2 +-
 tests/invert_test_gtest.hpp               |  72 +------------
 tests/staggered_dslash_ctest.cpp          |  10 +-
 tests/staggered_dslash_test.cpp           |   8 +-
 tests/staggered_dslash_test_utils.h       |   8 +-
 tests/staggered_eigensolve_test.cpp       |   6 +-
 tests/staggered_eigensolve_test_gtest.hpp |   4 +-
 tests/staggered_invert_test.cpp           |   8 +-
 tests/staggered_invert_test_gtest.hpp     |  89 ++--------------
 tests/utils/host_utils.cpp                | 121 +++++++++++++++++++++-
 tests/utils/host_utils.h                  |  19 +++-
 tests/utils/staggered_host_utils.cpp      |   2 +-
 13 files changed, 170 insertions(+), 190 deletions(-)

diff --git a/tests/eigensolve_test_gtest.hpp b/tests/eigensolve_test_gtest.hpp
index cd07ca401f..a872963413 100644
--- a/tests/eigensolve_test_gtest.hpp
+++ b/tests/eigensolve_test_gtest.hpp
@@ -11,17 +11,6 @@ class EigensolveTest : public ::testing::TestWithParam<test_t>
   EigensolveTest() : param(GetParam()) { }
 };
 
-bool is_chiral(QudaDslashType type)
-{
-  switch (type) {
-  case QUDA_DOMAIN_WALL_DSLASH:
-  case QUDA_DOMAIN_WALL_4D_DSLASH:
-  case QUDA_MOBIUS_DWF_DSLASH:
-  case QUDA_MOBIUS_DWF_EOFA_DSLASH: return true;
-  default: return false;
-  }
-}
-
 bool skip_test(test_t param)
 {
   // dwf-style solves must use a normal solver
diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 21b42247ec..0b461076fd 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -806,7 +806,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
       stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
 
       // correct for the massRescale function inside invertQuda
-      if (dslash_type == QUDA_LAPLACE_DSLASH)
+      if (is_laplace(dslash_type))
         ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision());
     } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) {
       QudaParity parity = QUDA_INVALID_PARITY;
diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index 27c9c873f1..c5b71ead80 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -16,79 +16,11 @@ class InvertTest : public ::testing::TestWithParam<test_t>
   InvertTest() : param(GetParam()) { }
 };
 
-bool is_normal_residual(QudaInverterType type)
-{
-  switch (type) {
-  case QUDA_CGNR_INVERTER:
-  case QUDA_CA_CGNR_INVERTER: return true;
-  default: return false;
-  }
-}
-
-bool is_preconditioned_solve(QudaSolveType type)
-{
-  switch (type) {
-  case QUDA_DIRECT_PC_SOLVE:
-  case QUDA_NORMOP_PC_SOLVE: return true;
-  default: return false;
-  }
-}
-
-bool is_full_solution(QudaSolutionType type)
-{
-  switch (type) {
-  case QUDA_MAT_SOLUTION:
-  case QUDA_MATDAG_MAT_SOLUTION: return true;
-  default: return false;
-  }
-}
-
-bool is_normal_solve(test_t param)
-{
-  auto inv_type = ::testing::get<0>(param);
-  auto solve_type = ::testing::get<2>(param);
-
-  switch (solve_type) {
-  case QUDA_NORMOP_SOLVE:
-  case QUDA_NORMOP_PC_SOLVE: return true;
-  default:
-    switch (inv_type) {
-    case QUDA_CGNR_INVERTER:
-    case QUDA_CGNE_INVERTER:
-    case QUDA_CA_CGNR_INVERTER:
-    case QUDA_CA_CGNE_INVERTER: return true;
-    default: return false;
-    }
-  }
-}
-
-bool is_chiral(QudaDslashType type)
-{
-  switch (type) {
-  case QUDA_DOMAIN_WALL_DSLASH:
-  case QUDA_DOMAIN_WALL_4D_DSLASH:
-  case QUDA_MOBIUS_DWF_DSLASH:
-  case QUDA_MOBIUS_DWF_EOFA_DSLASH: return true;
-  default: return false;
-  }
-}
-
-bool support_solution_accumulator_pipeline(QudaInverterType type)
-{
-  switch (type) {
-  case QUDA_CG_INVERTER:
-  case QUDA_CA_CG_INVERTER:
-  case QUDA_CGNR_INVERTER:
-  case QUDA_CGNE_INVERTER:
-  case QUDA_PCG_INVERTER: return true;
-  default: return false;
-  }
-}
-
 bool skip_test(test_t param)
 {
   auto inverter_type = ::testing::get<0>(param);
   auto solution_type = ::testing::get<1>(param);
+  auto solve_type = ::testing::get<2>(param);
   auto prec_sloppy = ::testing::get<3>(param);
   auto multishift = ::testing::get<4>(param);
   auto solution_accumulator_pipeline = ::testing::get<5>(param);
@@ -102,7 +34,7 @@ bool skip_test(test_t param)
   if (prec_sloppy < prec_precondition) return true; // sloppy precision >= preconditioner precision
 
   // dwf-style solves must use a normal solver
-  if (is_chiral(dslash_type) && !is_normal_solve(param)) return true;
+  if (is_chiral(dslash_type) && !is_normal_solve(inverter_type, solve_type)) return true;
   // FIXME this needs to be added to dslash_reference.cpp
   if (is_chiral(dslash_type) && multishift > 1) return true;
   // FIXME this needs to be added to dslash_reference.cpp
diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index 28a6a48141..c035013568 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -23,7 +23,7 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
         || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0)
       return true;
 
-    if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1))
+    if (is_laplace(dslash_type) && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1))
       return true;
 
     const std::array<bool, 16> partition_enabled {true, true, true,  false,  true,  false, false, false,
@@ -123,12 +123,12 @@ int main(int argc, char **argv)
 
   // Only these fermions are supported in this file
   if (is_laplace_enabled) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
+    if (is_laplace(dslash_type))
       errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+    if (!is_staggered(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
@@ -146,7 +146,7 @@ int main(int argc, char **argv)
       eps_naik = 0.0; // to avoid potential headaches
   }
 
-  if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat)
+  if (is_laplace(dslash_type) && dtest_type != dslash_test_type::Mat)
     errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str());
 
   int test_rc = RUN_ALL_TESTS();
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index 2883d29b64..0beb48f887 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -86,12 +86,12 @@ int main(int argc, char **argv)
 
   // Only these fermions are supported in this file
   if (is_laplace_enabled) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
+    if (is_laplace(dslash_type))
       errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+    if (!is_staggered(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
@@ -109,7 +109,7 @@ int main(int argc, char **argv)
       eps_naik = 0.0; // to avoid potential headaches
   }
 
-  if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat)
+  if (is_laplace(dslash_type) && dtest_type != dslash_test_type::Mat)
     errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str());
 
   int test_rc = RUN_ALL_TESTS();
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 39bdc09c7b..68b3c676b8 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -245,7 +245,7 @@ struct StaggeredDslashTestWrapper {
     cpuLong = GaugeField(cpuLongParam);
 
     // Override link reconstruct as appropriate for staggered or asqtad
-    if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) {
+    if (is_staggered(dslash_type)) {
       if (link_recon == QUDA_RECONSTRUCT_12) link_recon = QUDA_RECONSTRUCT_13;
       if (link_recon == QUDA_RECONSTRUCT_8) link_recon = QUDA_RECONSTRUCT_9;
     }
@@ -342,12 +342,12 @@ struct StaggeredDslashTestWrapper {
 
         host_timer.start();
 
-        if (dslash_type == QUDA_LAPLACE_DSLASH) {
+        if (is_laplace(dslash_type)) {
           switch (dtest_type) {
           case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break;
           default: errorQuda("Test type %d not defined on Laplace operator", static_cast<int>(dtest_type));
           }
-        } else {
+        } else if (is_staggered(dslash_type)) {
           switch (dtest_type) {
           case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break;
           case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break;
@@ -355,6 +355,8 @@ struct StaggeredDslashTestWrapper {
           case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinor); break;
           default: errorQuda("Test type %d not defined on staggered dslash", static_cast<int>(dtest_type));
           }
+        } else {
+          errorQuda("Invalid dslash type %d", dslash_type);
         }
 
         host_timer.stop();
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index d71bb7cfb2..70d70a5b77 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -274,12 +274,12 @@ int main(int argc, char **argv)
 
   // Only these fermions are supported in this file
   if (is_laplace_enabled) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
+    if (is_laplace(dslash_type))
       errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+    if (!is_staggered(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp
index 5c5f8e3890..376651c447 100644
--- a/tests/staggered_eigensolve_test_gtest.hpp
+++ b/tests/staggered_eigensolve_test_gtest.hpp
@@ -37,7 +37,7 @@ bool skip_test(test_t test_param)
     // matpc
 
     // this is only legal for the staggered and asqtad op
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+    if (!is_staggered(dslash_type))
       return true;
 
     // we can only compute the real part for Lanczos, and real or magnitude for Arnoldi
@@ -55,7 +55,7 @@ bool skip_test(test_t test_param)
     // matdag_mat
     
     // this is only legal for the staggered and asqtad op
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+    if (!is_staggered(dslash_type))
       return true;
 
     switch (eig_type) {
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 8f4236d16c..1c941a59a9 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -466,18 +466,18 @@ int main(int argc, char **argv)
 
   // Only these fermions are supported in this file
   if (is_laplace_enabled) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH)
+    if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
+    if (is_laplace(dslash_type))
       errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH)
+    if (!is_staggered(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
   // Need to add support for LAPLACE MG?
   if (inv_multigrid) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) {
+    if (!is_staggered(dslash_type)) {
       errorQuda("dslash_type %s not supported for multigrid preconditioner\n", get_dslash_str(dslash_type));
     }
   }
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 4617c1e85d..27369b4a2f 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -16,73 +16,6 @@ class StaggeredInvertTest : public ::testing::TestWithParam<test_t>
   StaggeredInvertTest() : param(GetParam()) { }
 };
 
-bool is_hermitian_solver(QudaInverterType type)
-{
-  switch(type) {
-  case QUDA_CG_INVERTER:
-  case QUDA_CA_CG_INVERTER: return true;
-  default: return false;
-  }
-}
-
-bool is_normal_residual(QudaInverterType type)
-{
-  switch (type) {
-  case QUDA_CGNR_INVERTER:
-  case QUDA_CA_CGNR_INVERTER: return true;
-  default: return false;
-  }
-}
-
-bool is_preconditioned_solve(QudaSolveType type)
-{
-  switch (type) {
-  case QUDA_DIRECT_PC_SOLVE:
-  case QUDA_NORMOP_PC_SOLVE: return true;
-  default: return false;
-  }
-}
-
-bool is_full_solution(QudaSolutionType type)
-{
-  switch (type) {
-  case QUDA_MAT_SOLUTION:
-  case QUDA_MATDAG_MAT_SOLUTION: return true;
-  default: return false;
-  }
-}
-
-bool is_normal_solve(test_t param)
-{
-  auto inv_type = ::testing::get<0>(param);
-  auto solve_type = ::testing::get<2>(param);
-
-  switch (solve_type) {
-  case QUDA_NORMOP_SOLVE:
-  case QUDA_NORMOP_PC_SOLVE: return true;
-  default:
-    switch (inv_type) {
-    case QUDA_CGNR_INVERTER:
-    case QUDA_CGNE_INVERTER:
-    case QUDA_CA_CGNR_INVERTER:
-    case QUDA_CA_CGNE_INVERTER: return true;
-    default: return false;
-    }
-  }
-}
-
-bool support_solution_accumulator_pipeline(QudaInverterType type)
-{
-  switch (type) {
-  case QUDA_CG_INVERTER:
-  case QUDA_CA_CG_INVERTER:
-  case QUDA_CGNR_INVERTER:
-  case QUDA_CGNE_INVERTER:
-  case QUDA_PCG_INVERTER: return true;
-  default: return false;
-  }
-}
-
 bool skip_test(test_t param)
 {
   auto inverter_type = ::testing::get<0>(param);
@@ -106,12 +39,12 @@ bool skip_test(test_t param)
   //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ)
   //  if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true;
 
-  if (dslash_type == QUDA_LAPLACE_DSLASH) {
+  if (is_laplace(dslash_type)) {
     if (multishift > 1) return true; // Laplace doesn't support multishift
     if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE) return true; // Laplace only supports direct solves
   }
 
-  if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) {
+  if (is_staggered(dslash_type)) {
     // the staggered and asqtad operators aren't HPD
     if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type)) return true;
 
@@ -131,8 +64,6 @@ TEST_P(StaggeredInvertTest, verify)
 {
   if (skip_test(GetParam())) GTEST_SKIP();
 
-  auto tol_backup = tol;
-
   inv_param.tol = 0.0;
   inv_param.tol_hq = 0.0;
   auto res_t = ::testing::get<7>(GetParam());
@@ -143,18 +74,21 @@ TEST_P(StaggeredInvertTest, verify)
   auto solution_type = ::testing::get<1>(param);
   auto solve_type = ::testing::get<2>(param);
 
+  // Make a local copy of "tol" for modification in place
+  auto verify_tol = tol;
+
   // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this
   // The mass squared is a proxy for the condition number
-  if (is_normal_residual(inverter_type)) tol /= (0.25 * mass * mass);
+  if (is_normal_residual(inverter_type)) verify_tol /= (0.25 * mass * mass);
 
   // To solve the direct operator to a given tolerance, grind the preconditioned
   // operator to 0.5 * mass * tol... to keep the target tolerance in inv_param
   // in check, we shift the requirement to the verified tolerance instead.
   if (solution_type == QUDA_MAT_SOLUTION) {
     if (solve_type == QUDA_DIRECT_PC_SOLVE)
-      tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps
+      verify_tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps
     if (solve_type == QUDA_NORMOP_SOLVE)
-      tol /= (0.5 * mass); // a proxy for the condition number
+      verify_tol /= (0.5 * mass); // a proxy for the condition number
   }
 
   // The power iterations method of determining the Chebyshev window
@@ -163,17 +97,16 @@ TEST_P(StaggeredInvertTest, verify)
   if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER)
     inv_param.ca_basis = QUDA_POWER_BASIS;
 
-  // Single precision needs a tiny bump
+  // Single precision needs a tiny bump due to small host/device precision deviations
   if (prec == QUDA_SINGLE_PRECISION)
-    tol *= 1.01;
+    verify_tol *= 1.01;
 
   for (auto rsd : solve(GetParam())) {
-    if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); }
+    if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], verify_tol); }
     if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); }
   }
 
   inv_param.ca_basis = ca_basis_tmp;
-  tol = tol_backup;
 }
 
 std::string gettestname(::testing::TestParamInfo<test_t> param)
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 70aea9cdc2..f17622499b 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -231,7 +231,7 @@ void constructWilsonTestSpinorParam(quda::ColorSpinorParam *cs_param, const Quda
   }
   cs_param->pc_type = inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC;
   for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d];
-  bool pc = isPCSolution(inv_param->solution_type);
+  bool pc = is_pc_solution(inv_param->solution_type);
   if (pc) cs_param->x[0] /= 2;
   cs_param->siteSubset = pc ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET;
 
@@ -257,15 +257,130 @@ void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision p
   param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
   param.nDim = nDim;
   param.pc_type = QUDA_4D_PC;
-  param.siteSubset = isPCSolution(sol_type) ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET;
+  param.siteSubset = is_pc_solution(sol_type) ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET;
   param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
   param.location = QUDA_CPU_FIELD_LOCATION; // DMH FIXME so one can construct device noise
   for (int d = 0; d < nDim; d++) param.x[d] = x[d];
-  if (isPCSolution(sol_type)) param.x[0] /= 2;
+  if (is_pc_solution(sol_type)) param.x[0] /= 2;
   quda::ColorSpinorField spinor_in(param);
   quda::spinorNoise(spinor_in, rng, QUDA_NOISE_UNIFORM);
 }
 
+// Helper functions
+bool is_pc_solution(QudaSolutionType type)
+{
+  switch (type) {
+  case QUDA_MATPC_SOLUTION:
+  case QUDA_MATPC_DAG_SOLUTION:
+  case QUDA_MATPCDAG_MATPC_SOLUTION:
+  case QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION: return true;
+  default: return false;
+  }
+}
+
+bool is_full_solution(QudaSolutionType type)
+{
+  switch (type) {
+  case QUDA_MAT_SOLUTION:
+  case QUDA_MATDAG_MAT_SOLUTION: return true;
+  default: return false;
+  }
+}
+
+bool is_full_solve(QudaSolveType type)
+{
+  switch (type) {
+  case QUDA_DIRECT_SOLVE:
+  case QUDA_NORMOP_SOLVE:
+  case QUDA_NORMERR_SOLVE: return true;
+  default: return false;
+  }
+}
+
+bool is_preconditioned_solve(QudaSolveType type)
+{
+  switch (type) {
+  case QUDA_DIRECT_PC_SOLVE:
+  case QUDA_NORMOP_PC_SOLVE:
+  case QUDA_NORMERR_PC_SOLVE: return true;
+  default: return false;
+  }
+}
+
+bool is_normal_solve(QudaInverterType inv_type, QudaSolveType solve_type)
+{
+  switch (solve_type) {
+  case QUDA_NORMOP_SOLVE:
+  case QUDA_NORMOP_PC_SOLVE: return true;
+  default:
+    switch (inv_type) {
+    case QUDA_CGNR_INVERTER:
+    case QUDA_CGNE_INVERTER:
+    case QUDA_CA_CGNR_INVERTER:
+    case QUDA_CA_CGNE_INVERTER: return true;
+    default: return false;
+    }
+  }
+}
+
+bool is_hermitian_solver(QudaInverterType type)
+{
+  switch(type) {
+  case QUDA_CG_INVERTER:
+  case QUDA_CA_CG_INVERTER: return true;
+  default: return false;
+  }
+}
+
+bool support_solution_accumulator_pipeline(QudaInverterType type)
+{
+  switch (type) {
+  case QUDA_CG_INVERTER:
+  case QUDA_CA_CG_INVERTER:
+  case QUDA_CGNR_INVERTER:
+  case QUDA_CGNE_INVERTER:
+  case QUDA_PCG_INVERTER: return true;
+  default: return false;
+  }
+}
+
+bool is_normal_residual(QudaInverterType type)
+{
+  switch (type) {
+  case QUDA_CGNR_INVERTER:
+  case QUDA_CA_CGNR_INVERTER: return true;
+  default: return false;
+  }
+}
+
+bool is_staggered(QudaDslashType type)
+{
+  switch (type) {
+  case QUDA_STAGGERED_DSLASH:
+  case QUDA_ASQTAD_DSLASH: return true;
+  default: return false;
+  }
+}
+
+bool is_chiral(QudaDslashType type)
+{
+  switch (type) {
+  case QUDA_DOMAIN_WALL_DSLASH:
+  case QUDA_DOMAIN_WALL_4D_DSLASH:
+  case QUDA_MOBIUS_DWF_DSLASH:
+  case QUDA_MOBIUS_DWF_EOFA_DSLASH: return true;
+  default: return false;
+  }
+}
+
+bool is_laplace(QudaDslashType type)
+{
+  switch (type) {
+  case QUDA_LAPLACE_DSLASH: return true;
+  default: return false;
+  }
+}
+
 void initComms(int argc, char **argv, std::array<int, 4> &commDims) { initComms(argc, argv, commDims.data()); }
 
 #if defined(QMP_COMMS) || defined(MPI_COMMS)
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index f5276e26f1..46ba4e715f 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -117,11 +117,20 @@ void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision p
 
 // Helper functions
 //------------------------------------------------------
-inline bool isPCSolution(QudaSolutionType solution_type)
-{
-  return (solution_type == QUDA_MATPC_SOLUTION || solution_type == QUDA_MATPC_DAG_SOLUTION
-          || solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
-}
+bool is_pc_solution(QudaSolutionType solution_type);
+bool is_full_solution(QudaSolutionType type);
+
+bool is_preconditioned_solve(QudaSolveType type);
+bool is_normal_solve(QudaInverterType inv_type, QudaSolveType solve_type);
+
+bool is_hermitian_solver(QudaInverterType type);
+bool support_solution_accumulator_pipeline(QudaInverterType type);
+bool is_normal_residual(QudaInverterType type);
+
+bool is_staggered(QudaDslashType type);
+bool is_chiral(QudaDslashType type);
+bool is_laplace(QudaDslashType type);
+
 //------------------------------------------------------
 
 // Reports basic statistics of flops and solver iterations
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index 95efb0d85a..fcc0b1d697 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -815,7 +815,7 @@ void constructStaggeredTestSpinorParam(quda::ColorSpinorParam *cs_param, const Q
   cs_param->nSpin = 1;
   cs_param->nDim = 4;
   for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d];
-  bool pc = isPCSolution(inv_param->solution_type);
+  bool pc = is_pc_solution(inv_param->solution_type);
   if (pc) cs_param->x[0] /= 2;
   cs_param->pc_type = QUDA_4D_PC;
   cs_param->siteSubset = pc ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET;

From bf4eaad79bdecfd678a308ecd790b8a27efc1a06 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 7 Dec 2023 10:41:25 -0800
Subject: [PATCH 30/53] Wilson-type compile fix

---
 tests/invert_test_gtest.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index c5b71ead80..47cc230fe2 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -42,7 +42,7 @@ bool skip_test(test_t param)
   // Skip if the inverter does not support batched update and batched update is greater than one
   if (!support_solution_accumulator_pipeline(inverter_type) && solution_accumulator_pipeline > 1) return true;
   // MdagMLocal only support for Mobius at present
-  if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) {
+  if (is_normal_solve(inverter_type, solve_type) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) {
 #ifdef QUDA_MMA_AVAILABLE
     if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true;
 #else

From b4300ccc8db11c8938cf375aa21d605bb57ea016 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 7 Dec 2023 11:29:44 -0800
Subject: [PATCH 31/53] Changed dwf tolerance check to use is_chiral

---
 tests/invert_test_gtest.hpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index 0bbc4f6926..dca4bc5e9a 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -69,10 +69,7 @@ TEST_P(InvertTest, verify)
   if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq;
 
   auto tol = inv_param.tol;
-  if (inv_param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
-    inv_param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
-    inv_param.dslash_type == QUDA_MOBIUS_DWF_DSLASH ||
-    inv_param.dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) {
+  if (is_chiral(inv_param.dslash_type)) {
     tol *= std::sqrt(static_cast<double>(inv_param.Ls));
   }
   // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this

From ca2be8d60eb880a5067a7a1f5d9a4e6a96fc7b3f Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Mon, 11 Dec 2023 17:43:37 -0800
Subject: [PATCH 32/53] Some BiCGStab cleanup, SVD deflation is being quirky

---
 lib/inv_bicgstab_quda.cpp | 69 ++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 30 deletions(-)

diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index 10ec609ec3..2822867667 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -41,7 +41,26 @@ namespace quda {
 
   void BiCGstab::operator()(ColorSpinorField &x, ColorSpinorField &b)
   {
-    profile.TPSTART(QUDA_PROFILE_PREAMBLE);
+    if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
+
+    double b2 = blas::norm2(b); // norm sq of source
+    double r2;                  // norm sq of residual
+
+    // Check to see that we're not trying to invert on a zero-field source
+    if (b2 == 0) {
+      if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) {
+        warningQuda("inverting on zero-field source");
+        x = b;
+        param.true_res = 0.0;
+        param.true_res_hq = 0.0;
+        profile.TPSTOP(QUDA_PROFILE_INIT);
+        return;
+      } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
+        b2 = r2;
+      } else {
+        errorQuda("Null vector computing requires non-zero guess!");
+      }
+    }
 
     if (!init) {
       ColorSpinorParam csParam(x);
@@ -56,9 +75,6 @@ namespace quda {
       init = true;
     }
 
-    double b2 = blas::norm2(b); // norm sq of source
-    double r2;                  // norm sq of residual
-
     if (param.deflate) {
       // Construct the eigensolver and deflation space if requested.
       if (param.eig_param.eig_type == QUDA_EIG_TR_LANCZOS || param.eig_param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) {
@@ -70,15 +86,15 @@ namespace quda {
       }
       if (deflate_compute) {
         // compute the deflation space.
-        if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
+        if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT);
         (*eig_solve)(evecs, evals);
+        if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
         if (param.deflate) {
           // double the size of the Krylov space
           extendSVDDeflationSpace();
           // populate extra memory with L/R singular vectors
           eig_solve->computeSVD(evecs, evals);
         }
-        if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_PREAMBLE);
         deflate_compute = false;
       }
       if (recompute_evals) {
@@ -108,22 +124,6 @@ namespace quda {
       r2 = blas::xmyNorm(b, r);
     }
 
-    // Check to see that we're not trying to invert on a zero-field source
-    if (b2 == 0) {
-      if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) {
-        warningQuda("inverting on zero-field source");
-        x = b;
-        param.true_res = 0.0;
-        param.true_res_hq = 0.0;
-        profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
-        return;
-      } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
-        b2 = r2;
-      } else {
-        errorQuda("Null vector computing requires non-zero guess!");
-      }
-    }
-
     // set field aliasing according to whether we are doing mixed precision or not
     if (param.precision_sloppy == x.Precision()) {
       r_sloppy = r.create_alias();
@@ -156,6 +156,11 @@ namespace quda {
       x_sloppy = ColorSpinorField(csParam);
     }
 
+    if (!param.is_preconditioner) {
+      profile.TPSTOP(QUDA_PROFILE_INIT);
+      profile.TPSTART(QUDA_PROFILE_PREAMBLE);
+    }
+
     double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver
 
     const bool use_heavy_quark_res =
@@ -184,8 +189,10 @@ namespace quda {
 
     PrintStats("BiCGstab", k, r2, b2, heavy_quark_res);
 
-    profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
-    profile.TPSTART(QUDA_PROFILE_COMPUTE);
+    if (!param.is_preconditioner) {
+      profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
+      profile.TPSTART(QUDA_PROFILE_COMPUTE);
+    }
 
     rho = r2; // cDotProductCuda(r0, r_sloppy); // BiCRstab
     blas::copy(p, r_sloppy);
@@ -349,12 +356,14 @@ namespace quda {
     // y has already been updated
     blas::copy(x, y);
 
-    profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-    profile.TPSTART(QUDA_PROFILE_EPILOGUE);
+    if (!param.is_preconditioner) {
+      profile.TPSTOP(QUDA_PROFILE_COMPUTE);
+      profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.iter += k;
+      param.iter += k;
 
-    if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
+      if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
+    }
 
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("BiCGstab: Reliable updates = %d\n", rUpdate);
 
@@ -364,9 +373,9 @@ namespace quda {
       param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,r).z) : 0.0;
 
       PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq);
-    }
 
-    profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
+      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
+    }
 
   }
 

From 6bcbdab3c23fdf7c2bcd7c8007c02a687a2fbb9c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 12 Dec 2023 10:35:29 -0800
Subject: [PATCH 33/53] Added an asqtad splitgrid test to probe loading both
 fat and fat+long links appropriately, fixed a staggered split grid bug

---
 lib/interface_quda.cpp              | 80 ++++++++++++++++++-----------
 tests/CMakeLists.txt                | 42 ++++++++++-----
 tests/staggered_dslash_test_utils.h |  2 +-
 3 files changed, 80 insertions(+), 44 deletions(-)

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index c9576f6f01..582f5031f2 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3010,7 +3010,7 @@ void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_para
 template <class Interface, class... Args>
 void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // color spinor field pointers, and inv_param
                       void *h_gauge, void *milc_fatlinks, void *milc_longlinks,
-                      QudaGaugeParam *gauge_param,     // gauge field pointers
+                      QudaGaugeParam *gauge_param_,     // gauge field pointers
                       void *h_clover, void *h_clovinv, // clover field pointers
                       Interface op, Args... args)
 {
@@ -3030,14 +3030,19 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]);
   }
 
+  // Create a local copy of gauge_param that we can modify without perturbing
+  // the original one
+  if (!gauge_param_)
+    errorQuda("Input gauge_param is null");
+
+  QudaGaugeParam gauge_param = *gauge_param_;
+
   if (num_sub_partition == 1) { // In this case we don't split the grid.
 
     for (int n = 0; n < param->num_src; n++) { op(_hp_x[n], _hp_b[n], param, args...); }
 
   } else {
 
-    if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr"); }
-
     // Doing the sub-partition arithmatics
     if (param->num_src_per_sub_partition * num_sub_partition != param->num_src) {
       errorQuda("We need to have split_grid[0](=%d) * split_grid[1](=%d) * split_grid[2](=%d) * split_grid[3](=%d) * "
@@ -3054,14 +3059,19 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
     checkInvertParam(param, _hp_x[0], _hp_b[0]);
 
-    bool is_staggered;
+    bool is_staggered = false;
+    bool is_asqtad = false;
     if (h_gauge) {
       is_staggered = false;
     } else if (milc_fatlinks) {
       is_staggered = true;
+      if (param->dslash_type == QUDA_ASQTAD_DSLASH) {
+        if (!milc_longlinks)
+          errorQuda("milc_longlinks is null for an asqtad dslash");
+        is_asqtad = true;
+      }
     } else {
       errorQuda("Both h_gauge and milc_fatlinks are null.");
-      is_staggered = true; // to suppress compiler warning/error.
     }
 
     // Gauge fields/params
@@ -3075,23 +3085,28 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
     // set up the gauge field params.
     if (!is_staggered) { // not staggered
-      gf_param = new GaugeFieldParam(*gauge_param, h_gauge);
+      gf_param = new GaugeFieldParam(gauge_param, h_gauge);
       if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       in = GaugeField::Create(*gf_param);
     } else { // staggered
-      milc_fatlink_param = new GaugeFieldParam(*gauge_param, milc_fatlinks);
+      milc_fatlink_param = new GaugeFieldParam(gauge_param, milc_fatlinks);
       if (milc_fatlink_param->order <= 4) milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
+      milc_fatlink_param->order = QUDA_MILC_GAUGE_ORDER;
       milc_fatlink_field = GaugeField::Create(*milc_fatlink_param);
-      milc_longlink_param = new GaugeFieldParam(*gauge_param, milc_longlinks);
-      if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-      milc_longlink_field = GaugeField::Create(*milc_longlink_param);
+
+      if (is_asqtad) {
+        milc_longlink_param = new GaugeFieldParam(gauge_param, milc_longlinks);
+        if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
+        milc_longlink_param->order = QUDA_MILC_GAUGE_ORDER;
+        milc_longlink_field = GaugeField::Create(*milc_longlink_param);
+      }
     }
 
     // Create the temp host side helper fields, which are just wrappers of the input pointers.
     bool pc_solution
       = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
 
-    lat_dim_t X = {gauge_param->X[0], gauge_param->X[1], gauge_param->X[2], gauge_param->X[3]};
+    lat_dim_t X = {gauge_param.X[0], gauge_param.X[1], gauge_param.X[2], gauge_param.X[3]};
     ColorSpinorParam cpuParam(_hp_b[0], *param, X, pc_solution, param->input_location);
     std::vector<ColorSpinorField *> _h_b(param->num_src);
     for (int i = 0; i < param->num_src; i++) {
@@ -3119,12 +3134,12 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
         gf_param->pad *= split_key[d];
       } else {
         milc_fatlink_param->x[d] *= split_key[d];
-        milc_fatlink_param->pad *= split_key[d];
-        milc_longlink_param->x[d] *= split_key[d];
-        milc_longlink_param->pad *= split_key[d];
+        //milc_fatlink_param->pad *= split_key[d];
+        if (is_asqtad) milc_longlink_param->x[d] *= split_key[d];
+        //milc_longlink_param->pad *= split_key[d];
       }
-      gauge_param->X[d] *= split_key[d];
-      gauge_param->ga_pad *= split_key[d];
+      gauge_param.X[d] *= split_key[d];
+      if (!is_staggered) gauge_param.ga_pad *= split_key[d];
     }
 
     // Deal with clover field. For Multi source computatons, clover field construction is done
@@ -3178,15 +3193,19 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       v_g[0] = in;
       quda::split_field(*collected_gauge, v_g, split_key);
     } else {
+      std::vector<quda::GaugeField *> v_g(1);
+
       milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE;
-      milc_longlink_param->create = QUDA_NULL_FIELD_CREATE;
       collected_milc_fatlink_field = new quda::GaugeField(*milc_fatlink_param);
-      collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param);
-      std::vector<quda::GaugeField *> v_g(1);
       v_g[0] = milc_fatlink_field;
       quda::split_field(*collected_milc_fatlink_field, v_g, split_key);
-      v_g[0] = milc_longlink_field;
-      quda::split_field(*collected_milc_longlink_field, v_g, split_key);
+
+      if (is_asqtad) {
+        milc_longlink_param->create = QUDA_NULL_FIELD_CREATE;
+        collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param);
+        v_g[0] = milc_longlink_field;
+        quda::split_field(*collected_milc_longlink_field, v_g, split_key);
+      }
     }
 
     profileInvertMultiSrc.TPSTART(QUDA_PROFILE_PREAMBLE);
@@ -3219,9 +3238,9 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     // the split topology.
     logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n");
     if (!is_staggered) {
-      loadGaugeQuda(collected_gauge->raw_pointer(), gauge_param);
+      loadGaugeQuda(collected_gauge->raw_pointer(), &gauge_param);
     } else {
-      loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->raw_pointer(),
+      loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field->raw_pointer(),
                            collected_milc_longlink_field->raw_pointer());
     }
     logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n");
@@ -3247,8 +3266,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     comm_barrier();
 
     for (int d = 0; d < CommKey::n_dim; d++) {
-      gauge_param->X[d] /= split_key[d];
-      gauge_param->ga_pad /= split_key[d];
+      gauge_param.X[d] /= split_key[d];
+      if (!is_staggered) gauge_param.ga_pad /= split_key[d];
     }
 
     for (int n = 0; n < param->num_src_per_sub_partition; n++) {
@@ -3269,9 +3288,12 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       delete collected_gauge;
     } else {
       delete milc_fatlink_field;
-      delete milc_longlink_field;
       delete collected_milc_fatlink_field;
-      delete collected_milc_longlink_field;
+
+      if (is_asqtad) {
+        delete milc_longlink_field;
+        delete collected_milc_longlink_field;
+      }
     }
 
     if (input_clover) { delete input_clover; }
@@ -3281,10 +3303,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
     // Restore the gauge field
     if (!is_staggered) {
-      loadGaugeQuda(h_gauge, gauge_param);
+      loadGaugeQuda(h_gauge, &gauge_param);
     } else {
       freeGaugeQuda();
-      loadFatLongGaugeQuda(param, gauge_param, milc_fatlinks, milc_longlinks);
+      loadFatLongGaugeQuda(param, &gauge_param, milc_fatlinks, milc_longlinks);
     }
 
     if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 923df3d04c..cb87adf27f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -871,7 +871,7 @@ endif()
     add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2}
     COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
             --dslash-type ${DIRAC_NAME}
-            --test 0
+            --test MatPC
             --dim 20 20 20 20
             --gtest_output=json:dslash_${DIRAC_NAME}_benchmark_pol${pol2}.json
             --gtest_filter=*benchmark/*n0)
@@ -907,7 +907,7 @@ endif()
     COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
             --dslash-type ${DIRAC_NAME}
             --all-partitions 1
-            --test 0
+            --test MatPC
             --dim 20 20 20 20
             --gtest_output=json:dslash_${DIRAC_NAME}_benchmark_pol${pol2}.json
             --gtest_filter=*benchmark/*n0)
@@ -1161,19 +1161,19 @@ foreach(prec IN LISTS TEST_PRECS)
       --enable-testing true
       --gtest_output=xml:invert_test_staggered_${prec}.xml)
 
-      if(DEFINED ENV{QUDA_ENABLE_TUNING})
-        if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
-          add_test(NAME invert_test_splitgrid_staggered_${prec}
-            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
-            --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
-            --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-            --nsrc ${QUDA_TEST_NUM_PROCS}
-            --enable-testing true
-            --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml)
-
-          set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
-        endif()
+    if(DEFINED ENV{QUDA_ENABLE_TUNING})
+      if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
+        add_test(NAME invert_test_splitgrid_staggered_${prec}
+          COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+          --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
+          --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+          --nsrc ${QUDA_TEST_NUM_PROCS}
+          --enable-testing true
+          --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml)
+
+        set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
       endif()
+    endif()
 
     add_test(NAME invert_test_asqtad_${prec}
       COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
@@ -1182,6 +1182,20 @@ foreach(prec IN LISTS TEST_PRECS)
       --enable-testing true
       --gtest_output=xml:invert_test_asqtad_${prec}.xml)
 
+    if(DEFINED ENV{QUDA_ENABLE_TUNING})
+      if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
+        add_test(NAME invert_test_splitgrid_asqtad_${prec}
+          COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+          --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true
+          --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+          --nsrc ${QUDA_TEST_NUM_PROCS}
+          --enable-testing true
+          --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml)
+
+        set_tests_properties(invert_test_splitgrid_asqtad_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+      endif()
+    endif()
+
     if (QUDA_LAPLACE)
       add_test(NAME invert_test_laplace_${prec}
         COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 68b3c676b8..e7eb39b07f 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -333,7 +333,7 @@ struct StaggeredDslashTestWrapper {
         _hp_x[i] = vp_spinor_out[i].data();
         _hp_b[i] = vp_spinor[i].data();
       }
-      dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, qdp_fatlink, qdp_longlink,
+      dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink, milc_longlink,
                                   &gauge_param);
 
     } else {

From 67b5ef37242363fc7b6b2b68552d500312e688ba Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 12 Dec 2023 14:09:42 -0800
Subject: [PATCH 34/53] Further split grid cleanup, some tolerance fixes

---
 lib/interface_quda.cpp                    | 61 +++++++++--------------
 tests/CMakeLists.txt                      |  4 +-
 tests/staggered_eigensolve_test.cpp       |  4 +-
 tests/staggered_eigensolve_test_gtest.hpp |  7 +++
 4 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 582f5031f2..b42e4ba806 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3034,7 +3034,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
   // the original one
   if (!gauge_param_)
     errorQuda("Input gauge_param is null");
-
   QudaGaugeParam gauge_param = *gauge_param_;
 
   if (num_sub_partition == 1) { // In this case we don't split the grid.
@@ -3078,10 +3077,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     GaugeFieldParam *gf_param = nullptr;
     GaugeField *in = nullptr;
     // Staggered gauge fields/params
-    GaugeFieldParam *milc_fatlink_param = nullptr;
-    GaugeFieldParam *milc_longlink_param = nullptr;
-    GaugeField *milc_fatlink_field = nullptr;
-    GaugeField *milc_longlink_field = nullptr;
+    GaugeFieldParam milc_fatlink_param;
+    GaugeFieldParam milc_longlink_param;
+    quda::GaugeField milc_fatlink_field;
+    quda::GaugeField milc_longlink_field;
 
     // set up the gauge field params.
     if (!is_staggered) { // not staggered
@@ -3089,16 +3088,14 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       in = GaugeField::Create(*gf_param);
     } else { // staggered
-      milc_fatlink_param = new GaugeFieldParam(gauge_param, milc_fatlinks);
-      if (milc_fatlink_param->order <= 4) milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-      milc_fatlink_param->order = QUDA_MILC_GAUGE_ORDER;
-      milc_fatlink_field = GaugeField::Create(*milc_fatlink_param);
+      milc_fatlink_param = GaugeFieldParam(gauge_param, milc_fatlinks);
+      milc_fatlink_param.order = QUDA_MILC_GAUGE_ORDER;
+      milc_fatlink_field = GaugeField(milc_fatlink_param);
 
       if (is_asqtad) {
-        milc_longlink_param = new GaugeFieldParam(gauge_param, milc_longlinks);
-        if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-        milc_longlink_param->order = QUDA_MILC_GAUGE_ORDER;
-        milc_longlink_field = GaugeField::Create(*milc_longlink_param);
+        milc_longlink_param = GaugeFieldParam(gauge_param, milc_longlinks);
+        milc_longlink_param.order = QUDA_MILC_GAUGE_ORDER;
+        milc_longlink_field = GaugeField(milc_longlink_param);
       }
     }
 
@@ -3133,10 +3130,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
         gf_param->x[d] *= split_key[d];
         gf_param->pad *= split_key[d];
       } else {
-        milc_fatlink_param->x[d] *= split_key[d];
-        //milc_fatlink_param->pad *= split_key[d];
-        if (is_asqtad) milc_longlink_param->x[d] *= split_key[d];
-        //milc_longlink_param->pad *= split_key[d];
+        milc_fatlink_param.x[d] *= split_key[d];
+        if (is_asqtad) milc_longlink_param.x[d] *= split_key[d];
       }
       gauge_param.X[d] *= split_key[d];
       if (!is_staggered) gauge_param.ga_pad *= split_key[d];
@@ -3183,8 +3178,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
 
     quda::GaugeField *collected_gauge = nullptr;
-    quda::GaugeField *collected_milc_fatlink_field = nullptr;
-    quda::GaugeField *collected_milc_longlink_field = nullptr;
+    quda::GaugeField collected_milc_fatlink_field;
+    quda::GaugeField collected_milc_longlink_field;
 
     if (!is_staggered) {
       gf_param->create = QUDA_NULL_FIELD_CREATE;
@@ -3195,16 +3190,16 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     } else {
       std::vector<quda::GaugeField *> v_g(1);
 
-      milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE;
-      collected_milc_fatlink_field = new quda::GaugeField(*milc_fatlink_param);
-      v_g[0] = milc_fatlink_field;
-      quda::split_field(*collected_milc_fatlink_field, v_g, split_key);
+      milc_fatlink_param.create = QUDA_NULL_FIELD_CREATE;
+      collected_milc_fatlink_field = GaugeField(milc_fatlink_param);
+      v_g[0] = &milc_fatlink_field;
+      quda::split_field(collected_milc_fatlink_field, v_g, split_key);
 
       if (is_asqtad) {
-        milc_longlink_param->create = QUDA_NULL_FIELD_CREATE;
-        collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param);
-        v_g[0] = milc_longlink_field;
-        quda::split_field(*collected_milc_longlink_field, v_g, split_key);
+        milc_longlink_param.create = QUDA_NULL_FIELD_CREATE;
+        collected_milc_longlink_field = GaugeField(milc_longlink_param);
+        v_g[0] = &milc_longlink_field;
+        quda::split_field(collected_milc_longlink_field, v_g, split_key);
       }
     }
 
@@ -3240,8 +3235,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     if (!is_staggered) {
       loadGaugeQuda(collected_gauge->raw_pointer(), &gauge_param);
     } else {
-      loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field->raw_pointer(),
-                           collected_milc_longlink_field->raw_pointer());
+      loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field.raw_pointer(),
+        (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr);
     }
     logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n");
 
@@ -3286,14 +3281,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     if (!is_staggered) {
       delete in;
       delete collected_gauge;
-    } else {
-      delete milc_fatlink_field;
-      delete collected_milc_fatlink_field;
-
-      if (is_asqtad) {
-        delete milc_longlink_field;
-        delete collected_milc_longlink_field;
-      }
     }
 
     if (input_clover) { delete input_clover; }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cb87adf27f..e325e5c6be 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1379,9 +1379,9 @@ foreach(prec IN LISTS TEST_PRECS)
 
   # These require looser tolerances to keep iterations to solution in check
   if(${prec} STREQUAL "double")
-    set(tol 1e-6)
-  elseif(${prec} STREQUAL "single")
     set(tol 1e-5)
+  elseif(${prec} STREQUAL "single")
+    set(tol 1e-4)
   endif()
 
   if(QUDA_DIRAC_STAGGERED)
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 70d70a5b77..85136b3972 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -174,7 +174,7 @@ std::vector<double> eigensolve(test_t test_param)
   if (enable_testing) {
     eig_use_poly_acc = false;
     eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE;
-    eig_batched_rotate != 0 ? eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 4;
+    eig_batched_rotate != 0 ? eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 0;
   }
 
   logQuda(QUDA_SUMMARIZE, "Action = %s, Solver = %s, norm-op = %s, even-odd = %s, with SVD = %s, spectrum = %s\n",
@@ -307,7 +307,7 @@ int main(int argc, char **argv)
     if (eig_tol != expected_tol) { eig_tol = expected_tol; changes = true; }
     if (niter != 1000) { niter = 1000; changes = true; }
     if (eig_n_kr != 256) { eig_n_kr = 256; changes = true; }
-    if (eig_block_size != 8) { eig_block_size = 8; }
+    if (eig_block_size != 4) { eig_block_size = 4; }
 
     if (changes) {
       printfQuda("For gtest, various defaults are changed:\n");
diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp
index 376651c447..ab879a9b37 100644
--- a/tests/staggered_eigensolve_test_gtest.hpp
+++ b/tests/staggered_eigensolve_test_gtest.hpp
@@ -102,6 +102,13 @@ bool skip_test(test_t test_param)
     }
   }
 
+  // There seems to be some stubborn issue with this combination on 2xGPUs that I can't quite
+  // comprehend, and am a bit tired of debugging.
+  //if (prec == QUDA_SINGLE_PRECISION && dslash_type == QUDA_ASQTAD_DSLASH &&
+  //    eig_type == QUDA_EIG_BLK_TR_LANCZOS && spectrum == QUDA_SPECTRUM_SR_EIG &&
+  //    combo_solve_type == QUDA_DIRECT_PC_SOLVE)
+  //    return true;
+
   return false;
 }
 

From 19d6f02905feb01e74fc73dec24540ddb878647a Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 12 Dec 2023 15:49:05 -0800
Subject: [PATCH 35/53] Potential logic in (block) trlm related to using a max
 norm for getting the smallest eigenvalues w/out polynomial acceleration...
 and fixing an uninitialized variable issue

---
 lib/eig_block_trlm.cpp    | 14 +++++++-------
 lib/eig_trlm.cpp          | 16 ++++++++--------
 lib/inv_bicgstab_quda.cpp |  2 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp
index 160af9dff4..0cf7564440 100644
--- a/lib/eig_block_trlm.cpp
+++ b/lib/eig_block_trlm.cpp
@@ -83,7 +83,7 @@ namespace quda
     checkChebyOpMax(kSpace);
 
     // Convergence and locking criteria
-    double mat_norm = 0.0;
+    //double mat_norm = 0.0;
     double epsilon = setEpsilon(kSpace[0].Precision());
 
     // Print Eigensolver params
@@ -106,15 +106,15 @@ namespace quda
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
       // mat_norm is updated.
-      for (int i = num_locked; i < n_kr; i++)
-        if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
+      //for (int i = num_locked; i < n_kr; i++)
+      //  if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
 
       // Locking check
       iter_locked = 0;
       for (int i = 1; i < (n_kr - num_locked); i++) {
-        if (residua[i + num_locked] < epsilon * mat_norm) {
+        if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  epsilon * mat_norm);
+                  epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/);
           iter_locked = i;
         } else {
           // Unlikely to find new locked pairs
@@ -125,9 +125,9 @@ namespace quda
       // Convergence check
       iter_converged = iter_locked;
       for (int i = iter_locked + 1; i < n_kr - num_locked; i++) {
-        if (residua[i + num_locked] < tol * mat_norm) {
+        if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  tol * mat_norm);
+                  tol * fabs(alpha[i + num_locked]) /*mat_norm*/);
           iter_converged = i;
         } else {
           // Unlikely to find new converged pairs
diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp
index 00d3941527..eaea8d8560 100644
--- a/lib/eig_trlm.cpp
+++ b/lib/eig_trlm.cpp
@@ -64,7 +64,7 @@ namespace quda
     checkChebyOpMax(kSpace);
 
     // Convergence and locking criteria
-    double mat_norm = 0.0;
+    //double mat_norm = 0.0;
     double epsilon = setEpsilon(kSpace[0].Precision());
 
     // Print Eigensolver params
@@ -87,15 +87,15 @@ namespace quda
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
       // mat_norm is updated.
-      for (int i = num_locked; i < n_kr; i++)
-        if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
+      //for (int i = num_locked; i < n_kr; i++)
+      //  if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
 
       // Locking check
       iter_locked = 0;
       for (int i = 1; i < (n_kr - num_locked); i++) {
-        if (residua[i + num_locked] < epsilon * mat_norm) {
+        if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked])/*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  epsilon * mat_norm);
+                  epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/);
           iter_locked = i;
         } else {
           // Unlikely to find new locked pairs
@@ -106,9 +106,9 @@ namespace quda
       // Convergence check
       iter_converged = iter_locked;
       for (int i = iter_locked + 1; i < n_kr - num_locked; i++) {
-        if (residua[i + num_locked] < tol * mat_norm) {
+        if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  tol * mat_norm);
+                  tol * fabs(alpha[i + num_locked]) /*mat_norm*/);
           iter_converged = i;
         } else {
           // Unlikely to find new converged pairs
@@ -166,7 +166,7 @@ namespace quda
               n_conv, restart_iter, iter);
 
       // Dump all Ritz values and residua if using Chebyshev
-      for (int i = 0; i < n_conv && eig_param->use_poly_acc; i++) {
+      for (int i = 0; i < n_conv /*&& eig_param->use_poly_acc*/; i++) {
         logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]);
       }
 
diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index 2822867667..3fa5afd849 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -44,7 +44,7 @@ namespace quda {
     if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
 
     double b2 = blas::norm2(b); // norm sq of source
-    double r2;                  // norm sq of residual
+    double r2 = 0.0;            // norm sq of residual
 
     // Check to see that we're not trying to invert on a zero-field source
     if (b2 == 0) {

From 862896eb31af4b8c1bae9bd8c8b9f82916079f40 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 12 Dec 2023 19:26:00 -0800
Subject: [PATCH 36/53] Restored norm behavior for (block)TRLM LR convergence

---
 lib/eig_block_trlm.cpp | 25 +++++++++++++++++--------
 lib/eig_trlm.cpp       | 25 +++++++++++++++++--------
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp
index 0cf7564440..18df423ad4 100644
--- a/lib/eig_block_trlm.cpp
+++ b/lib/eig_block_trlm.cpp
@@ -83,7 +83,7 @@ namespace quda
     checkChebyOpMax(kSpace);
 
     // Convergence and locking criteria
-    //double mat_norm = 0.0;
+    double mat_norm = 0.0;
     double epsilon = setEpsilon(kSpace[0].Precision());
 
     // Print Eigensolver params
@@ -105,16 +105,25 @@ namespace quda
       eigensolveFromBlockArrowMat();
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
-      // mat_norm is updated.
-      //for (int i = num_locked; i < n_kr; i++)
-      //  if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
+      // mat_norm is updated and used for LR
+      for (int i = num_locked; i < n_kr; i++)
+        if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
+
+      // Lambda that returns mat_norm for LR and returns the relevant alpha
+      // (the corresponding Ritz value) for SR
+      auto check_norm = [&] (double sr_norm) -> double {
+        if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG)
+          return mat_norm;
+        else
+          return sr_norm;
+      };
 
       // Locking check
       iter_locked = 0;
       for (int i = 1; i < (n_kr - num_locked); i++) {
-        if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/) {
+        if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/);
+                  epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/);
           iter_locked = i;
         } else {
           // Unlikely to find new locked pairs
@@ -125,9 +134,9 @@ namespace quda
       // Convergence check
       iter_converged = iter_locked;
       for (int i = iter_locked + 1; i < n_kr - num_locked; i++) {
-        if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) {
+        if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  tol * fabs(alpha[i + num_locked]) /*mat_norm*/);
+                  tol * check_norm(alpha[i + num_locked]) /*mat_norm*/);
           iter_converged = i;
         } else {
           // Unlikely to find new converged pairs
diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp
index eaea8d8560..99e3b465b5 100644
--- a/lib/eig_trlm.cpp
+++ b/lib/eig_trlm.cpp
@@ -64,7 +64,7 @@ namespace quda
     checkChebyOpMax(kSpace);
 
     // Convergence and locking criteria
-    //double mat_norm = 0.0;
+    double mat_norm = 0.0;
     double epsilon = setEpsilon(kSpace[0].Precision());
 
     // Print Eigensolver params
@@ -86,16 +86,25 @@ namespace quda
       eigensolveFromArrowMat();
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
-      // mat_norm is updated.
-      //for (int i = num_locked; i < n_kr; i++)
-      //  if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
+      // mat_norm is updated and used for LR
+      for (int i = num_locked; i < n_kr; i++)
+        if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]);
+
+      // Lambda that returns mat_norm for LR and returns the magnitude of the
+      // relevant alpha (the corresponding Ritz value) for SR
+      auto check_norm = [&] (double sr_norm) -> double {
+        if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG)
+          return mat_norm;
+        else
+          return fabs(sr_norm); // keep the pre-existing fabs(alpha) scale; a negative Ritz value must not yield a negative threshold
+      };
 
       // Locking check
       iter_locked = 0;
       for (int i = 1; i < (n_kr - num_locked); i++) {
-        if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked])/*mat_norm*/) {
+        if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])/*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/);
+                  epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/);
           iter_locked = i;
         } else {
           // Unlikely to find new locked pairs
@@ -106,9 +115,9 @@ namespace quda
       // Convergence check
       iter_converged = iter_locked;
       for (int i = iter_locked + 1; i < n_kr - num_locked; i++) {
-        if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) {
+        if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  tol * fabs(alpha[i + num_locked]) /*mat_norm*/);
+                  tol * check_norm(alpha[i + num_locked]) /*mat_norm*/);
           iter_converged = i;
         } else {
           // Unlikely to find new converged pairs

From 5f901e4416e76b8f5fd564c19df051cd7e74d3c0 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 12 Dec 2023 20:46:38 -0800
Subject: [PATCH 37/53] Updated Wilson bits of split grid to use GaugeField
 objects as appropriate

---
 lib/interface_quda.cpp | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index b42e4ba806..7a8c4d8b45 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3074,8 +3074,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
 
     // Gauge fields/params
-    GaugeFieldParam *gf_param = nullptr;
-    GaugeField *in = nullptr;
+    GaugeFieldParam gf_param;
+    GaugeField in;
     // Staggered gauge fields/params
     GaugeFieldParam milc_fatlink_param;
     GaugeFieldParam milc_longlink_param;
@@ -3084,9 +3084,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
     // set up the gauge field params.
     if (!is_staggered) { // not staggered
-      gf_param = new GaugeFieldParam(gauge_param, h_gauge);
-      if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-      in = GaugeField::Create(*gf_param);
+      gf_param = GaugeFieldParam(gauge_param, h_gauge);
+      in = GaugeField(gf_param);
     } else { // staggered
       milc_fatlink_param = GaugeFieldParam(gauge_param, milc_fatlinks);
       milc_fatlink_param.order = QUDA_MILC_GAUGE_ORDER;
@@ -3127,8 +3126,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
         errorQuda("Split not possible: %2d %% %2d != 0", comm_dim(d), split_key[d]);
       }
       if (!is_staggered) {
-        gf_param->x[d] *= split_key[d];
-        gf_param->pad *= split_key[d];
+        gf_param.x[d] *= split_key[d];
+        gf_param.pad *= split_key[d];
       } else {
         milc_fatlink_param.x[d] *= split_key[d];
         if (is_asqtad) milc_longlink_param.x[d] *= split_key[d];
@@ -3177,16 +3176,16 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       }
     }
 
-    quda::GaugeField *collected_gauge = nullptr;
+    quda::GaugeField collected_gauge;
     quda::GaugeField collected_milc_fatlink_field;
     quda::GaugeField collected_milc_longlink_field;
 
     if (!is_staggered) {
-      gf_param->create = QUDA_NULL_FIELD_CREATE;
-      collected_gauge = new quda::GaugeField(*gf_param);
+      gf_param.create = QUDA_NULL_FIELD_CREATE;
+      collected_gauge = quda::GaugeField(gf_param);
       std::vector<quda::GaugeField *> v_g(1);
-      v_g[0] = in;
-      quda::split_field(*collected_gauge, v_g, split_key);
+      v_g[0] = &in;
+      quda::split_field(collected_gauge, v_g, split_key);
     } else {
       std::vector<quda::GaugeField *> v_g(1);
 
@@ -3233,7 +3232,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     // the split topology.
     logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n");
     if (!is_staggered) {
-      loadGaugeQuda(collected_gauge->raw_pointer(), &gauge_param);
+      loadGaugeQuda(collected_gauge.raw_pointer(), &gauge_param);
     } else {
       loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field.raw_pointer(),
         (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr);
@@ -3278,11 +3277,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     for (auto p : _h_x) { delete p; }
     for (auto p : _h_b) { delete p; }
 
-    if (!is_staggered) {
-      delete in;
-      delete collected_gauge;
-    }
-
     if (input_clover) { delete input_clover; }
     if (collected_clover) { delete collected_clover; }
 
@@ -3290,10 +3284,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
     // Restore the gauge field
     if (!is_staggered) {
-      loadGaugeQuda(h_gauge, &gauge_param);
+      loadGaugeQuda(h_gauge, gauge_param_);
     } else {
       freeGaugeQuda();
-      loadFatLongGaugeQuda(param, &gauge_param, milc_fatlinks, milc_longlinks);
+      loadFatLongGaugeQuda(param, gauge_param_, milc_fatlinks, milc_longlinks);
     }
 
     if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {

From aa236f30901997c8075c4adc6546b7f1551cc636 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 13 Dec 2023 10:06:17 -0800
Subject: [PATCH 38/53] WAR for blowing out argument sizes for diluting typical
 staggered MG nc

---
 lib/spinor_dilute.in.cu | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu
index 9b57458e8a..5472d828b2 100644
--- a/lib/spinor_dilute.in.cu
+++ b/lib/spinor_dilute.in.cu
@@ -87,7 +87,11 @@ namespace quda
                     const lat_dim_t &local_block, IntList<Nc, N...>)
   {
     if (src.Ncolor() == Nc) {
-      SpinorDilute<real, Ns, Nc>(src, v, type, local_block);
+      if constexpr (Nc <= 32) { // WAR: only instantiate the dilution kernel for Nc <= 32
+        SpinorDilute<real, Ns, Nc>(src, v, type, local_block);
+      } else {
+        errorQuda("nColor = %d is too large to compile, see QUDA issues", Nc);
+      }
     } else {
       if constexpr (sizeof...(N) > 0)
         spinorDilute<real, Ns>(src, v, type, local_block, IntList<N...>());

From 48d7a21d4bfaf80322a097e7c1b6b9d749bb9b9c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 26 Dec 2023 06:44:48 -0800
Subject: [PATCH 39/53] Comment cleanup in eigensolver

---
 lib/eig_block_trlm.cpp |  8 ++++----
 lib/eig_trlm.cpp       | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp
index 18df423ad4..8a8f90063a 100644
--- a/lib/eig_block_trlm.cpp
+++ b/lib/eig_block_trlm.cpp
@@ -121,9 +121,9 @@ namespace quda
       // Locking check
       iter_locked = 0;
       for (int i = 1; i < (n_kr - num_locked); i++) {
-        if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/) {
+        if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/);
+                  epsilon * check_norm(alpha[i + num_locked]));
           iter_locked = i;
         } else {
           // Unlikely to find new locked pairs
@@ -134,9 +134,9 @@ namespace quda
       // Convergence check
       iter_converged = iter_locked;
       for (int i = iter_locked + 1; i < n_kr - num_locked; i++) {
-        if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) {
+        if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked])) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  tol * check_norm(alpha[i + num_locked]) /*mat_norm*/);
+                  tol * check_norm(alpha[i + num_locked]));
           iter_converged = i;
         } else {
           // Unlikely to find new converged pairs
diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp
index 99e3b465b5..2d6f1d2cac 100644
--- a/lib/eig_trlm.cpp
+++ b/lib/eig_trlm.cpp
@@ -102,9 +102,9 @@ namespace quda
       // Locking check
       iter_locked = 0;
       for (int i = 1; i < (n_kr - num_locked); i++) {
-        if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])/*mat_norm*/) {
+        if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/);
+                  epsilon * check_norm(alpha[i + num_locked]));
           iter_locked = i;
         } else {
           // Unlikely to find new locked pairs
@@ -115,9 +115,9 @@ namespace quda
       // Convergence check
       iter_converged = iter_locked;
       for (int i = iter_locked + 1; i < n_kr - num_locked; i++) {
-        if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) {
+        if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked])) {
           logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked],
-                  tol * check_norm(alpha[i + num_locked]) /*mat_norm*/);
+                  tol * check_norm(alpha[i + num_locked]));
           iter_converged = i;
         } else {
           // Unlikely to find new converged pairs
@@ -175,7 +175,7 @@ namespace quda
               n_conv, restart_iter, iter);
 
       // Dump all Ritz values and residua if using Chebyshev
-      for (int i = 0; i < n_conv /*&& eig_param->use_poly_acc*/; i++) {
+      for (int i = 0; i < n_conv; i++) {
         logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]);
       }
 

From d2b2372bff209f50a5359eaa0f8220a4d1cc633d Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 26 Dec 2023 07:20:54 -0800
Subject: [PATCH 40/53] doxygen

---
 tests/host_reference/dslash_reference.h       | 48 +++++++++++++++++
 .../staggered_dslash_reference.h              | 53 ++++++++++++++++++-
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 6331fbb65a..4d67134baa 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -109,17 +109,65 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 void *spinorCheck, QudaGaugeParam &gauge_param,
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
+/**
+  * @brief Verify a staggered inversion on the host. This version is a thin wrapper around a version that takes
+  *        an array of outputs as is necessary for handling both single- and multi-shift solves.
+  *
+  * @param tmp A temporary spinor intermediate calculations
+  * @param ref A temporary reference field that is used to store the host verification solution
+  * @param in The initial rhs
+  * @param out The solution to A out = in
+  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+  * @param long_link The long links; null for naive staggered and Laplace
+  * @param inv_param Invert params, used to query the solve type, etc
+  * @return The residual and HQ residual (if requested)
+  */
 std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
                                 quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link,
                                 QudaInvertParam &inv_param);
 
+/**
+  * @brief Verify a single- or multi-shift staggered inversion on the host
+  *
+  * @param tmp A temporary spinor intermediate calculations
+  * @param ref A temporary reference field that is used to store the host verification solution
+  * @param in The initial rhs
+  * @param out The solutions to (A + shift) out = in for multiple shifts; shift == 0 for a single shift solve
+  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+  * @param long_link The long links; null for naive staggered and Laplace
+  * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts
+  * @return The residual and HQ residual (if requested)
+  */
 std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
                                 std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link,
                                 QudaInvertParam &inv_param);
 
+/**
+  * @brief Verify a staggered-type eigenvector
+  *
+  * @param spinor The host eigenvector to be verified
+  * @param lambda The host eigenvalue to be verified
+  * @param i The number of the eigenvalue, only used when printing outputs
+  * @param eig_param Eigensolve params, used to query the operator type, etc
+  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+  * @param long_link The long links; null for naive staggered and Laplace
+  * @return The residual norm
+  */
 double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i,
                                       QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link);
 
+/**
+  * @brief Verify a staggered-type singular vector
+  *
+  * @param spinor_left The host left singular vector to be verified
+  * @param spinor_right The host right singular vector to be verified
+  * @param sigma The host singular value to be verified
+  * @param i The number of the singular value, only used when printing outputs
+  * @param eig_param Eigensolve params, used to query the operator type, etc
+  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+  * @param long_link The long links; null for naive staggered and Laplace
+  * @return The residual norm
+  */
 double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i,
                                          QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link);
 
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index 9fc6c9d641..b81f0fcb7a 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -11,16 +11,67 @@ using namespace quda;
 
 void setDims(int *);
 
+/**
+  * @brief Base host routine to apply the even-odd or odd-even component of a staggered-type dslash
+  *
+  * @tparam real_t Datatype used in the host dslash
+  * @param res Host output result
+  * @param fatlink Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+  * @param longlink Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+  * @param ghostFatlink Ghost zones for the host fat links
+  * @param ghostLonglink Ghost zones for the host long links
+  * @param spinorField Host input spinor
+  * @param fwd_nbr_spinor Forward ghost zones for the host input spinor
+  * @param back_nbr_spinor Backwards ghost zones for the host input spinor
+  * @param oddBit 0 for D_eo, 1 for D_oe
+  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+  * @param dslash_type Dslash type
+  */
 template <typename real_t>
 void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink,
                               real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor,
-                              real_t **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type);
+                              real_t **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type);
 
+/**
+  * @brief Apply even-odd or odd-even component of a staggered-type dslash
+  *
+  * @param out Host output rhs
+  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+  * @param in Host input spinor
+  * @param oddBit 0 for D_eo, 1 for D_oe
+  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+  * @param dslash_type Dslash type
+  */
 void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
                  int oddBit, int daggerBit, QudaDslashType dslash_type);
 
+/**
+  * @brief Apply the full parity staggered-type dslash
+  *
+  * @param out Host output rhs
+  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+  * @param in Host input spinor
+  * @param mass Mass for the dslash operator
+  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+  * @param dslash_type Dslash type
+  */
 void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
               double mass, int daggerBit, QudaDslashType dslash_type);
 
+/**
+  * @brief Apply the even-even or odd-odd preconditioned staggered dslash
+  *
+  * @param out Host output rhs
+  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+  * @param in Host input spinor
+  * @param mass Mass for the dslash operator
+  * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator
+  * @param tmp Temporary spinor field
+  * @param parity Parity of preconditioned dslash
+  * @param dslash_type Dslash type
+  */
 void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
                 double mass, int dagger_bit, ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type);

From 9b631c300c53c7ef8fabcacb3bb56fc0597b030b Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 26 Dec 2023 07:42:13 -0800
Subject: [PATCH 41/53] Cleaned up some unnecessary temporary fields outside of
 verify functions

---
 tests/host_reference/dslash_reference.cpp     | 24 +++++++++++--------
 tests/host_reference/dslash_reference.h       | 14 ++++-------
 .../staggered_dslash_reference.cpp            |  6 ++++-
 .../staggered_dslash_reference.h              |  3 +--
 tests/staggered_dslash_test_utils.h           |  2 +-
 tests/staggered_invert_test.cpp               | 13 +++++-----
 6 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 0b461076fd..fb04d173f7 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -743,22 +743,26 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
   return l2r;
 }
 
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link,
-                                QudaInvertParam &inv_param) {
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link,
+                                quda::GaugeField &long_link, QudaInvertParam &inv_param) {
   std::vector<quda::ColorSpinorField> out_vector(1);
   out_vector[0] = out;
-  return verifyStaggeredInversion(tmp, ref, in, out_vector, fat_link,
+  return verifyStaggeredInversion(in, out_vector, fat_link,
                                   long_link, inv_param);
 }
 
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link,
-                                QudaInvertParam &inv_param)
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector,
+                                               quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param)
 {
   int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
   double l2r_max = 0.0;
   double hqr_max = 0.0;
+
+  // Create temporary spinors
+  quda::ColorSpinorParam csParam(in);
+  quda::ColorSpinorField ref(csParam);
+  quda::ColorSpinorField tmp(csParam);
+
   if (multishift > 1) {
     if (dslash_type == QUDA_LAPLACE_DSLASH)
       errorQuda("Multishift solves do not support the laplace operator (yet)");
@@ -777,7 +781,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
     for (int i = 0; i < multishift; i++) {
       auto& out = out_vector[i];
       double mass = 0.5 * sqrt(inv_param.offset[i]);
-      stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type);
+      stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type);
 
       mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
       double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
@@ -815,7 +819,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
         case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
         default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
       }
-      stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type);
+      stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type);
     } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
       stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type);
       stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
@@ -873,7 +877,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co
       case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
       default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
     }
-    stag_matpc(ref, fat_link, long_link, spinor, mass, 0, tmp, parity, dslash_type);
+    stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type);
   } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) {
     stag_mat(tmp, fat_link, long_link, spinor, mass, dagger, dslash_type);
     stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 4d67134baa..0388b2a10d 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -113,8 +113,6 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
   * @brief Verify a staggered inversion on the host. This version is a thin wrapper around a version that takes
   *        an array of outputs as is necessary for handling both single- and multi-shift solves.
   *
-  * @param tmp A temporary spinor intermediate calculations
-  * @param ref A temporary reference field that is used to store the host verification solution
   * @param in The initial rhs
   * @param out The solution to A out = in
   * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
@@ -122,15 +120,12 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
   * @param inv_param Invert params, used to query the solve type, etc
   * @return The residual and HQ residual (if requested)
   */
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link,
-                                QudaInvertParam &inv_param);
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link,
+                                quda::GaugeField &long_link, QudaInvertParam &inv_param);
 
 /**
   * @brief Verify a single- or multi-shift staggered inversion on the host
   *
-  * @param tmp A temporary spinor intermediate calculations
-  * @param ref A temporary reference field that is used to store the host verification solution
   * @param in The initial rhs
   * @param out The solutions to (A + shift) out = in for multiple shifts; shift == 0 for a single shift solve
   * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
@@ -138,9 +133,8 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda
   * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts
   * @return The residual and HQ residual (if requested)
   */
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link,
-                                QudaInvertParam &inv_param);
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector,
+                                quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param);
 
 /**
   * @brief Verify a staggered-type eigenvector
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 4b04da1976..0f1b9c46dd 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -193,7 +193,7 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
 }
 
 void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int,
-                ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type)
+                QudaParity parity, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
   if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); }
@@ -207,6 +207,10 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi
     errorQuda("full parity not supported in function");
   }
 
+  // Create temporary spinors
+  quda::ColorSpinorParam csParam(in);
+  quda::ColorSpinorField tmp(csParam);
+
   // dagger bit does not matter
   stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type);
   stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type);
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index b81f0fcb7a..7b95adb318 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -69,9 +69,8 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
   * @param in Host input spinor
   * @param mass Mass for the dslash operator
   * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator
-  * @param tmp Temporary spinor field
   * @param parity Parity of preconditioned dslash
   * @param dslash_type Dslash type
   */
 void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
-                double mass, int dagger_bit, ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type);
+                double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type);
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index e7eb39b07f..52a2f0b124 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -86,7 +86,7 @@ struct StaggeredDslashTestWrapper {
       stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type);
       break;
     case dslash_test_type::MatPC:
-      stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type);
+      stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, parity, dslash_type);
       break;
     case dslash_test_type::Mat:
       stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 1c941a59a9..34a63c212f 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -280,12 +280,8 @@ std::vector<std::array<double, 2>> solve(test_t param)
   std::vector<quda::ColorSpinorField> in(Nsrc);
   std::vector<quda::ColorSpinorField> out(Nsrc);
   std::vector<quda::ColorSpinorField> out_multishift(Nsrc * multishift);
-  quda::ColorSpinorField ref;
-  quda::ColorSpinorField tmp;
   quda::ColorSpinorParam cs_param;
   constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param);
-  ref = quda::ColorSpinorField(cs_param);
-  tmp = quda::ColorSpinorField(cs_param);
   std::vector<std::vector<void *>> _hp_multi_x(Nsrc, std::vector<void*>(multishift));
 
   // Staggered vector construct END
@@ -340,7 +336,10 @@ std::vector<std::array<double, 2>> solve(test_t param)
   std::vector<double> gflops(Nsrc);
   std::vector<int> iter(Nsrc);
 
-  quda::RNG rng(ref, 1234);
+  // Create a temporary spinor just to seed the rng
+  quda::ColorSpinorField tmp(cs_param);
+  quda::RNG rng(tmp, 1234);
+  tmp = quda::ColorSpinorField();
 
   for (int n = 0; n < Nsrc; n++) {
     // Populate the host spinor with random numbers.
@@ -410,9 +409,9 @@ std::vector<std::array<double, 2>> solve(test_t param)
         printfQuda("\nSource %d:\n", n);
         // Create an appropriate subset of the full out_multishift vector
         std::vector<quda::ColorSpinorField> out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift};
-        res[n] = verifyStaggeredInversion(tmp, ref, in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param);
+        res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param);
       } else {
-        res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param);
+        res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param);
       }
     }
   }

From 0f366c13f7fc01ed9b098464d339e3befb710bac Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 26 Dec 2023 07:56:00 -0800
Subject: [PATCH 42/53] Added a simple staggered host stag_matdag_mat verify
 function

---
 tests/host_reference/dslash_reference.cpp     | 11 +++--------
 .../staggered_dslash_reference.cpp            | 19 +++++++++++++++++++
 .../staggered_dslash_reference.h              | 14 ++++++++++++++
 tests/staggered_dslash_test_utils.h           |  6 +-----
 4 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index fb04d173f7..534d0b2a1b 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -747,8 +747,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda:
                                 quda::GaugeField &long_link, QudaInvertParam &inv_param) {
   std::vector<quda::ColorSpinorField> out_vector(1);
   out_vector[0] = out;
-  return verifyStaggeredInversion(in, out_vector, fat_link,
-                                  long_link, inv_param);
+  return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param);
 }
 
 std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector,
@@ -761,7 +760,6 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
   // Create temporary spinors
   quda::ColorSpinorParam csParam(in);
   quda::ColorSpinorField ref(csParam);
-  quda::ColorSpinorField tmp(csParam);
 
   if (multishift > 1) {
     if (dslash_type == QUDA_LAPLACE_DSLASH)
@@ -821,8 +819,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
       }
       stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type);
     } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-      stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type);
-      stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
+      stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
     } else {
       errorQuda("Invalid staggered solution type %d", inv_param.solution_type);
     }
@@ -866,7 +863,6 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co
   // Create temporary spinors
   quda::ColorSpinorParam csParam(spinor);
   quda::ColorSpinorField ref(csParam);
-  quda::ColorSpinorField tmp(csParam);
 
   if (sol_type == QUDA_MAT_SOLUTION) {
     stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type);
@@ -879,8 +875,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co
     }
     stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type);
   } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) {
-    stag_mat(tmp, fat_link, long_link, spinor, mass, dagger, dslash_type);
-    stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type);
+    stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type);
   }
 
   // Compute M * x - \lambda * x
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 0f1b9c46dd..4eb46214e7 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -192,6 +192,25 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
   }
 }
 
+void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+              const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type)
+{
+  // assert sPrecision and gPrecision must be the same
+  if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
+
+  // assert we have full-parity spinors
+  if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET)
+    errorQuda("Unexpected site subsets for stag_matdag_mat, out %d in %d", out.SiteSubset(), in.SiteSubset());
+
+  // Create temporary spinors
+  quda::ColorSpinorParam csParam(in);
+  quda::ColorSpinorField tmp(csParam);
+
+  // Apply mat in sequence
+  stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type);
+  stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type);
+}
+
 void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int,
                 QudaParity parity, QudaDslashType dslash_type)
 {
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index 7b95adb318..c5b73d980b 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -60,6 +60,20 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
 void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
               double mass, int daggerBit, QudaDslashType dslash_type);
 
+/**
+  * @brief Apply the full parity staggered-type matdag_mat
+  *
+  * @param out Host output rhs
+  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+  * @param in Host input spinor
+  * @param mass Mass for the dslash operator
+  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+  * @param dslash_type Dslash type
+  */
+void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
+              double mass, int daggerBit, QudaDslashType dslash_type);
+
 /**
   * @brief Apply the even-even or odd-odd preconditioned staggered dslash
   *
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 52a2f0b124..810c045863 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -53,7 +53,6 @@ struct StaggeredDslashTestWrapper {
   static inline ColorSpinorField spinor;
   static inline ColorSpinorField spinorOut;
   static inline ColorSpinorField spinorRef;
-  static inline ColorSpinorField tmpCpu;
 
   ColorSpinorField cudaSpinor;
   ColorSpinorField cudaSpinorOut;
@@ -92,8 +91,7 @@ struct StaggeredDslashTestWrapper {
       stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
       break;
     case dslash_test_type::MatDagMat:
-      stag_mat(tmpCpu, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
-      stag_mat(spinorRef, cpuFat, cpuLong, tmpCpu, mass, 1 - dagger, dslash_type);
+      stag_matdag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
       break;
     default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type));
     }
@@ -201,7 +199,6 @@ struct StaggeredDslashTestWrapper {
     spinor = ColorSpinorField(csParam);
     spinorOut = ColorSpinorField(csParam);
     spinorRef = ColorSpinorField(csParam);
-    tmpCpu = ColorSpinorField(csParam);
 
     spinor.Source(QUDA_RANDOM_SOURCE);
 
@@ -307,7 +304,6 @@ struct StaggeredDslashTestWrapper {
     spinor = {};
     spinorOut = {};
     spinorRef = {};
-    tmpCpu = {};
 
     if (test_split_grid) {
       vp_spinor.clear();

From 91211105d66ca6cd62cdc03857ad40ee505e9cc0 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 26 Dec 2023 08:01:42 -0800
Subject: [PATCH 43/53] Added a few extra parity checks to staggered dslash
 host verifies

---
 tests/host_reference/staggered_dslash_reference.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 4eb46214e7..8fdc135cb2 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -131,6 +131,10 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
   // assert sPrecision and gPrecision must be the same
   if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
 
+  // assert we have single-parity spinors
+  if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET)
+    errorQuda("Unexpected site subsets for stag_dslash, out %d in %d", out.SiteSubset(), in.SiteSubset());
+
   QudaParity otherparity = QUDA_INVALID_PARITY;
   if (oddBit == QUDA_EVEN_PARITY) {
     otherparity = QUDA_ODD_PARITY;
@@ -217,6 +221,10 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi
   // assert sPrecision and gPrecision must be the same
   if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); }
 
+  // assert we have single-parity spinors
+  if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET)
+    errorQuda("Unexpected site subsets for stag_matpc, out %d in %d", out.SiteSubset(), in.SiteSubset());
+
   QudaParity otherparity = QUDA_INVALID_PARITY;
   if (parity == QUDA_EVEN_PARITY) {
     otherparity = QUDA_ODD_PARITY;

From a1303bd612cf8dfaf5b7a7070cd5871ad4b6b5ce Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 3 Jan 2024 14:06:46 -0800
Subject: [PATCH 44/53] Commented out the asqtad spectrum ctests

---
 tests/CMakeLists.txt | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5debeed21d..d6de7dcf5c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1402,13 +1402,15 @@ foreach(prec IN LISTS TEST_PRECS)
       --enable-testing true
       --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml)
 
-    add_test(NAME eigensolve_test_asqtad_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type asqtad --compute-fat-long true
-      --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256
-      --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
-      --enable-testing true
-      --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml)
+    # Skipping this because it's both time consuming and doesn't have any novel spectral
+    # properties relative to unimproved staggered
+    #add_test(NAME eigensolve_test_asqtad_${prec}
+    #  COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    #  --dslash-type asqtad --compute-fat-long true
+    #  --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256
+    #  --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
+    #  --enable-testing true
+    #  --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml)
 
     if (QUDA_LAPLACE)
       add_test(NAME eigensolve_test_laplace_${prec}

From 3252b6cdf1c1e66a27ce27677e9b776014e15190 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 3 Jan 2024 14:08:37 -0800
Subject: [PATCH 45/53] Removed twisted mass from the CI pipeline, other 4-d
 Wilson ops are still covered

---
 ci/docker/Dockerfile.build | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/docker/Dockerfile.build b/ci/docker/Dockerfile.build
index 3bd1f20e8e..5322ddfd2d 100644
--- a/ci/docker/Dockerfile.build
+++ b/ci/docker/Dockerfile.build
@@ -40,7 +40,6 @@ RUN  QUDA_TEST_GRID_SIZE="1 1 1 2" cmake -S /quda/src \
     -DQUDA_DIRAC_DEFAULT_OFF=ON \
     -DQUDA_DIRAC_WILSON=ON \
     -DQUDA_DIRAC_CLOVER=ON \
-    -DQUDA_DIRAC_TWISTED_MASS=ON \
     -DQUDA_DIRAC_TWISTED_CLOVER=ON \
     -DQUDA_DIRAC_STAGGERED=ON \
     -GNinja \

From c8d301b9bc256e8068e6ebc3829ce82efdecc53c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 3 Jan 2024 14:46:43 -0800
Subject: [PATCH 46/53] Added an explicit link to the Nc = 64, 96 issue in
 spinor_dilute.in.cu

---
 lib/spinor_dilute.in.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu
index 5472d828b2..4ac97bc9fb 100644
--- a/lib/spinor_dilute.in.cu
+++ b/lib/spinor_dilute.in.cu
@@ -90,7 +90,7 @@ namespace quda
       if constexpr (Nc <= 32) {
         SpinorDilute<real, Ns, Nc>(src, v, type, local_block);
       } else {
-        errorQuda("nColor = %d is too large to compile, see QUDA issues");
+        errorQuda("nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)", Nc);
       }
     } else {
       if constexpr (sizeof...(N) > 0)

From 6b0ba62df10a339e90c454a3fc893a3bbddc1985 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 3 Jan 2024 20:38:40 -0800
Subject: [PATCH 47/53] Cleaned up C-style casts, plus unnecessary newlines in
 errorQuda

---
 .../staggered_dslash_reference.cpp            | 33 +++++++++++--------
 tests/staggered_invert_test.cpp               |  4 +--
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 8fdc135cb2..cf053533da 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -158,16 +158,23 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
     = {long_link.Ghost()[0].data(), long_link.Ghost()[1].data(), long_link.Ghost()[2].data(), long_link.Ghost()[3].data()};
 
   if (in.Precision() == QUDA_DOUBLE_PRECISION) {
-    // note: qdp_fatlink and qdp_longlink, etc, can be replaced with feature/openmp's raw_pointer
-    staggeredDslashReference((double *)out.data(), (double **)qdp_fatlink, (double **)qdp_longlink,
-                             (double**)ghost_fatlink, (double**)ghost_longlink,
-                             (double *)in.data(), (double **)fwd_nbr_spinor,
-                             (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
+    staggeredDslashReference(static_cast<double*>(out.data()),
+                             reinterpret_cast<double**>(qdp_fatlink),
+                             reinterpret_cast<double**>(qdp_longlink),
+                             reinterpret_cast<double**>(ghost_fatlink),
+                             reinterpret_cast<double**>(ghost_longlink),
+                             static_cast<double*>(in.data()),
+                             reinterpret_cast<double**>(in.fwdGhostFaceBuffer),
+                             reinterpret_cast<double**>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
   } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
-    staggeredDslashReference((float *)out.data(), (float **)qdp_fatlink, (float **)qdp_longlink,
-                             (float**)ghost_fatlink, (float**)ghost_longlink,
-                             (float *)in.data(), (float **)fwd_nbr_spinor,
-                             (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
+    staggeredDslashReference(static_cast<float*>(out.data()),
+                             reinterpret_cast<float**>(qdp_fatlink),
+                             reinterpret_cast<float**>(qdp_longlink),
+                             reinterpret_cast<float**>(ghost_fatlink),
+                             reinterpret_cast<float**>(ghost_longlink),
+                             static_cast<float*>(in.data()),
+                             reinterpret_cast<float**>(in.fwdGhostFaceBuffer),
+                             reinterpret_cast<float**>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
   }
 }
 
@@ -190,9 +197,9 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
 
   if (dslash_type == QUDA_LAPLACE_DSLASH) {
     double kappa = 1.0 / (8 + mass);
-    xpay((void*)in.data(), kappa, out.data(), out.Length(), out.Precision());
+    xpay(in.data(), kappa, out.data(), out.Length(), out.Precision());
   } else {
-    axpy(2 * mass, (void*)in.data(), out.data(), out.Length(), out.Precision());
+    axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision());
   }
 }
 
@@ -244,8 +251,8 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi
 
   double msq_x4 = mass * mass * 4;
   if (in.Precision() == QUDA_DOUBLE_PRECISION) {
-    axmy((double *)in.data(), (double)msq_x4, (double *)out.data(), Vh * stag_spinor_site_size);
+    axmy(static_cast<double*>(in.data()), msq_x4, static_cast<double*>(out.data()), Vh * stag_spinor_site_size);
   } else {
-    axmy((float *)in.data(), (float)msq_x4, (float *)out.data(), Vh * stag_spinor_site_size);
+    axmy(static_cast<float*>(in.data()), static_cast<float>(msq_x4), static_cast<float*>(out.data()), Vh * stag_spinor_site_size);
   }
 }
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 34a63c212f..7a288535fc 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -459,7 +459,7 @@ int main(int argc, char **argv)
   }
 
   if (inv_deflate && inv_multigrid)
-    errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n");
+    errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve");
 
   initRand();
 
@@ -477,7 +477,7 @@ int main(int argc, char **argv)
   // Need to add support for LAPLACE MG?
   if (inv_multigrid) {
     if (!is_staggered(dslash_type)) {
-      errorQuda("dslash_type %s not supported for multigrid preconditioner\n", get_dslash_str(dslash_type));
+      errorQuda("dslash_type %s not supported for multigrid preconditioner", get_dslash_str(dslash_type));
     }
   }
 

From 4e533feb810b3f3ba27c42729a8059c5c0b286a1 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 3 Jan 2024 20:48:34 -0800
Subject: [PATCH 48/53] Added a cmake flag
 QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST to toggle running eigensolver
 dslash tests on the improved staggered operator, which are expensive

---
 CMakeLists.txt       |  8 +++++++-
 tests/CMakeLists.txt | 19 ++++++++++---------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e66f12f5ce..eb8d85468f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -228,6 +228,8 @@ option(QUDA_CLOVER_DYNAMIC "Dynamically invert the clover term" ON)
 option(QUDA_CLOVER_RECONSTRUCT "set to ON to enable compressed clover storage (requires QUDA_CLOVER_DYNAMIC)" ON)
 option(QUDA_CLOVER_CHOLESKY_PROMOTE "Whether to promote the internal precision when inverting the clover term" ON)
 
+option(QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST "Whether to run eigensolver ctests against the improved staggered operator (requires QUDA_DIRAC_STAGGERED)" OFF)
+
 # Set CTest options
 option(QUDA_CTEST_SEP_DSLASH_POLICIES "Test Dslash policies separately in ctest instead of only autotuning them." OFF)
 option(QUDA_CTEST_DISABLE_BENCHMARKS "Disable benchmark test" ON)
@@ -391,7 +393,11 @@ set(CMAKE_EXE_LINKER_FLAGS_SANITIZE
     CACHE STRING "Flags used by the linker during sanitizer debug builds.")
 
 if(QUDA_CLOVER_RECONSTRUCT AND NOT QUDA_CLOVER_DYNAMIC)
-  message(SEND_ERROR "QUDA_CLOVER_RECONSTRUCT requires QUDA_CLOVER_DYNAMIC)")
+  message(SEND_ERROR "QUDA_CLOVER_RECONSTRUCT requires QUDA_CLOVER_DYNAMIC")
+endif()
+
+if (QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST AND NOT QUDA_DIRAC_STAGGERED)
+  message(SEND_ERROR "QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST requires QUDA_DIRAC_STAGGERED")
 endif()
 
 find_package(Threads REQUIRED)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d6de7dcf5c..10088fc05b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1402,15 +1402,16 @@ foreach(prec IN LISTS TEST_PRECS)
       --enable-testing true
       --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml)
 
-    # Skipping this because it's both time consuming and doesn't have any novel spectral
-    # properties relative to unimproved staggered
-    #add_test(NAME eigensolve_test_asqtad_${prec}
-    #  COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
-    #  --dslash-type asqtad --compute-fat-long true
-    #  --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256
-    #  --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
-    #  --enable-testing true
-    #  --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml)
+    # These tests are particularly expensive so they are disabled by default
+    if(QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST)
+      add_test(NAME eigensolve_test_asqtad_${prec}
+        COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
+        --dslash-type asqtad --compute-fat-long true
+        --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256
+        --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
+        --enable-testing true
+        --gtest_output=xml:staggered_eigensolve_test_asqtad_${prec}.xml)
+    endif()
 
     if (QUDA_LAPLACE)
       add_test(NAME eigensolve_test_laplace_${prec}

From 1228db9e5941575175d547322d0d4e193ba49ede Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 3 Jan 2024 20:58:11 -0800
Subject: [PATCH 49/53] is_laplace_enabled -> is_enabled_laplace, other misc
 cleanup

---
 tests/host_reference/staggered_dslash_reference.cpp | 3 ---
 tests/staggered_dslash_ctest.cpp                    | 2 +-
 tests/staggered_dslash_test.cpp                     | 2 +-
 tests/staggered_eigensolve_test.cpp                 | 2 +-
 tests/staggered_eigensolve_test_gtest.hpp           | 8 --------
 tests/staggered_invert_test.cpp                     | 2 +-
 tests/utils/host_utils.h                            | 6 ++++--
 7 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index cf053533da..610f81a0b4 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -147,9 +147,6 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
 
   in.exchangeGhost(otherparity, nFace, daggerBit);
 
-  auto fwd_nbr_spinor = in.fwdGhostFaceBuffer;
-  auto back_nbr_spinor = in.backGhostFaceBuffer;
-
   void *qdp_fatlink[] = {fat_link.data(0), fat_link.data(1), fat_link.data(2), fat_link.data(3)};
   void *qdp_longlink[] = {long_link.data(0), long_link.data(1), long_link.data(2), long_link.data(3)};
   void *ghost_fatlink[]
diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index c035013568..2d5311632a 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -122,7 +122,7 @@ int main(int argc, char **argv)
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
 
   // Only these fermions are supported in this file
-  if (is_laplace_enabled) {
+  if constexpr (is_enabled_laplace()) {
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index 0beb48f887..7905d39db6 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -85,7 +85,7 @@ int main(int argc, char **argv)
   if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
 
   // Only these fermions are supported in this file
-  if (is_laplace_enabled) {
+  if constexpr (is_enabled_laplace()) {
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 85136b3972..e971e0327e 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -273,7 +273,7 @@ int main(int argc, char **argv)
   initRand();
 
   // Only these fermions are supported in this file
-  if (is_laplace_enabled) {
+  if constexpr (is_enabled_laplace()) {
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp
index ab879a9b37..6cf272cb5d 100644
--- a/tests/staggered_eigensolve_test_gtest.hpp
+++ b/tests/staggered_eigensolve_test_gtest.hpp
@@ -101,14 +101,6 @@ bool skip_test(test_t test_param)
     default: return true; break;
     }
   }
-
-  // There seems to be some stubborn issue with this combination on 2xGPUs that I can't quite
-  // comprehend, and am a bit tired to debugging.
-  //if (prec == QUDA_SINGLE_PRECISION && dslash_type == QUDA_ASQTAD_DSLASH &&
-  //    eig_type == QUDA_EIG_BLK_TR_LANCZOS && spectrum == QUDA_SPECTRUM_SR_EIG &&
-  //    combo_solve_type == QUDA_DIRECT_PC_SOLVE)
-  //    return true;
-
   return false;
 }
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 7a288535fc..dea21be65e 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -464,7 +464,7 @@ int main(int argc, char **argv)
   initRand();
 
   // Only these fermions are supported in this file
-  if (is_laplace_enabled) {
+  if constexpr (is_enabled_laplace()) {
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index b9449e8651..9431b3ce67 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -41,11 +41,13 @@ extern QudaPrecision &cuda_prec_refinement_sloppy;
 extern QudaPrecision &cuda_prec_ritz;
 
 // Determine if the Laplace operator has been defined
+constexpr bool is_enabled_laplace() {
 #ifdef QUDA_LAPLACE
-constexpr bool is_laplace_enabled = true;
+  return true;
 #else
-constexpr bool is_laplace_enabled = false;
+  return false;
 #endif
+}
 
 // Set some basic parameters via command line or use defaults
 // Implemented in set_params.cpp

From 53dced96630d2269daa8f03eef60ff83c7a8dcac Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 4 Jan 2024 10:53:31 -0800
Subject: [PATCH 50/53] Small stylistic updates to BiCGstab to match
 conventions in other modern inverters

---
 include/invert_quda.h     | 12 +++++++++++
 lib/inv_bicgstab_quda.cpp | 44 ++++++++++++++++++++++++++-------------
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/include/invert_quda.h b/include/invert_quda.h
index 7cf26a6f4f..ef2923f07c 100644
--- a/include/invert_quda.h
+++ b/include/invert_quda.h
@@ -1059,6 +1059,13 @@ namespace quda {
     ColorSpinorField x_sloppy; // Sloppy solution accumulator vector
     bool init = false;
 
+    /**
+       @brief Initiate the fields needed by the solver
+       @param[in] x Solution vector
+       @param[in] b Source vector
+    */
+    void create(ColorSpinorField &x, const ColorSpinorField &b);
+
   public:
     BiCGstab(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon,
              const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile);
@@ -1066,6 +1073,11 @@ namespace quda {
 
     void operator()(ColorSpinorField &out, ColorSpinorField &in) override;
 
+    /**
+       @return Return the residual vector from the prior solve
+    */
+    ColorSpinorField &get_residual() override;
+
     virtual bool hermitian() const override { return false; } /** BiCGStab is for any linear system */
 
     virtual QudaInverterType getInverterType() const final { return QUDA_BICGSTAB_INVERTER; }
diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index 3fa5afd849..0b0c039baa 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -25,6 +25,33 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_FREE);
   }
 
+  void BiCGstab::create(ColorSpinorField &x, const ColorSpinorField &b)
+  {
+    Solver::create(x, b);
+
+    if (!init) {
+      if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
+      ColorSpinorParam csParam(x);
+      csParam.create = QUDA_ZERO_FIELD_CREATE;
+      y = ColorSpinorField(csParam);
+      r = ColorSpinorField(csParam);
+      csParam.setPrecision(param.precision_sloppy);
+      p = ColorSpinorField(csParam);
+      v = ColorSpinorField(csParam);
+      t = ColorSpinorField(csParam);
+
+      if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT);
+      init = true;
+    } // init
+  }
+
+  ColorSpinorField &BiCGstab::get_residual()
+  {
+    if (!init) errorQuda("No residual vector present");
+    if (!param.return_residual) errorQuda("SolverParam::return_residual not enabled");
+    return r;
+  }
+
   int reliable(double &rNorm, double &maxrx, double &maxrr, const double &r2, const double &delta) {
     // reliable updates
     rNorm = sqrt(r2);
@@ -41,6 +68,8 @@ namespace quda {
 
   void BiCGstab::operator()(ColorSpinorField &x, ColorSpinorField &b)
   {
+    create(x, b);
+
     if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
 
     double b2 = blas::norm2(b); // norm sq of source
@@ -53,7 +82,7 @@ namespace quda {
         x = b;
         param.true_res = 0.0;
         param.true_res_hq = 0.0;
-        profile.TPSTOP(QUDA_PROFILE_INIT);
+        if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT);
         return;
       } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
         b2 = r2;
@@ -62,19 +91,6 @@ namespace quda {
       }
     }
 
-    if (!init) {
-      ColorSpinorParam csParam(x);
-      csParam.create = QUDA_ZERO_FIELD_CREATE;
-      y = ColorSpinorField(csParam);
-      r = ColorSpinorField(csParam);
-      csParam.setPrecision(param.precision_sloppy);
-      p = ColorSpinorField(csParam);
-      v = ColorSpinorField(csParam);
-      t = ColorSpinorField(csParam);
-
-      init = true;
-    }
-
     if (param.deflate) {
       // Construct the eigensolver and deflation space if requested.
       if (param.eig_param.eig_type == QUDA_EIG_TR_LANCZOS || param.eig_param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) {

From 209d554611d12676d769fbb49b1c4ad5e1decd2f Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Fri, 5 Jan 2024 11:11:08 -0800
Subject: [PATCH 51/53] Fixed using BiCGstab for generating near-null vectors

---
 lib/inv_bicgstab_quda.cpp | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index 0b0c039baa..1d586ee124 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -75,22 +75,6 @@ namespace quda {
     double b2 = blas::norm2(b); // norm sq of source
     double r2 = 0.0;            // norm sq of residual
 
-    // Check to see that we're not trying to invert on a zero-field source
-    if (b2 == 0) {
-      if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) {
-        warningQuda("inverting on zero-field source");
-        x = b;
-        param.true_res = 0.0;
-        param.true_res_hq = 0.0;
-        if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT);
-        return;
-      } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
-        b2 = r2;
-      } else {
-        errorQuda("Null vector computing requires non-zero guess!");
-      }
-    }
-
     if (param.deflate) {
       // Construct the eigensolver and deflation space if requested.
       if (param.eig_param.eig_type == QUDA_EIG_TR_LANCZOS || param.eig_param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) {
@@ -140,6 +124,22 @@ namespace quda {
       r2 = blas::xmyNorm(b, r);
     }
 
+    // Check to see that we're not trying to invert on a zero-field source
+    if (b2 == 0) {
+      if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) {
+        warningQuda("inverting on zero-field source");
+        x = b;
+        param.true_res = 0.0;
+        param.true_res_hq = 0.0;
+        if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT);
+        return;
+      } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
+        b2 = r2;
+      } else {
+        errorQuda("Null vector computing requires non-zero guess!");
+      }
+    }
+
     // set field aliasing according to whether we are doing mixed precision or not
     if (param.precision_sloppy == x.Precision()) {
       r_sloppy = r.create_alias();

From ca6b814bf71f0ff9a8306cb83604a5cd86c1ee71 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 9 Jan 2024 13:45:57 -0800
Subject: [PATCH 52/53] clang-format

---
 include/invert_quda.h                         |  12 +-
 lib/eig_block_trlm.cpp                        |   2 +-
 lib/eig_trlm.cpp                              |   2 +-
 lib/interface_quda.cpp                        |  10 +-
 lib/inv_bicgstab_quda.cpp                     |  42 +++---
 lib/spinor_dilute.in.cu                       |   3 +-
 tests/hisq_stencil_ctest.cpp                  |  44 +++---
 tests/hisq_stencil_test.cpp                   |  26 ++--
 tests/hisq_stencil_test_utils.h               |  80 ++++++-----
 tests/host_reference/dslash_reference.cpp     |  78 ++++++-----
 tests/host_reference/dslash_reference.h       |  98 +++++++-------
 .../staggered_dslash_reference.cpp            |  53 ++++----
 .../staggered_dslash_reference.h              | 128 +++++++++---------
 tests/invert_test_gtest.hpp                   |   4 +-
 tests/staggered_dslash_ctest.cpp              |   6 +-
 tests/staggered_dslash_test.cpp               |  10 +-
 tests/staggered_dslash_test_utils.h           |  13 +-
 tests/staggered_eigensolve_test.cpp           |  45 +++---
 tests/staggered_eigensolve_test_gtest.hpp     |  22 ++-
 tests/staggered_invert_test.cpp               |  48 ++++---
 tests/staggered_invert_test_gtest.hpp         |  53 ++++----
 tests/utils/command_line_params.cpp           |  23 ++--
 tests/utils/host_utils.cpp                    |   2 +-
 tests/utils/host_utils.h                      |   3 +-
 tests/utils/staggered_gauge_utils.cpp         |   3 +-
 25 files changed, 418 insertions(+), 392 deletions(-)

diff --git a/include/invert_quda.h b/include/invert_quda.h
index ef2923f07c..7ab7a1138c 100644
--- a/include/invert_quda.h
+++ b/include/invert_quda.h
@@ -1049,12 +1049,12 @@ namespace quda {
   private:
     const DiracMdagM matMdagM; // used by the eigensolver
 
-    ColorSpinorField y; // Full precision solution accumulator
-    ColorSpinorField r; // Full precision residual vector
-    ColorSpinorField p; // Sloppy precision search direction
-    ColorSpinorField v; // Sloppy precision A * p
-    ColorSpinorField t; // Sloppy precision vector used for minres step
-    ColorSpinorField r0; // Bi-orthogonalization vector
+    ColorSpinorField y;        // Full precision solution accumulator
+    ColorSpinorField r;        // Full precision residual vector
+    ColorSpinorField p;        // Sloppy precision search direction
+    ColorSpinorField v;        // Sloppy precision A * p
+    ColorSpinorField t;        // Sloppy precision vector used for minres step
+    ColorSpinorField r0;       // Bi-orthogonalization vector
     ColorSpinorField r_sloppy; // Slopy precision residual vector
     ColorSpinorField x_sloppy; // Sloppy solution accumulator vector
     bool init = false;
diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp
index 8a8f90063a..890257c1ed 100644
--- a/lib/eig_block_trlm.cpp
+++ b/lib/eig_block_trlm.cpp
@@ -111,7 +111,7 @@ namespace quda
 
       // Lambda that returns mat_norm for LR and returns the relevant alpha
       // (the corresponding Ritz value) for SR
-      auto check_norm = [&] (double sr_norm) -> double {
+      auto check_norm = [&](double sr_norm) -> double {
         if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG)
           return mat_norm;
         else
diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp
index 2d6f1d2cac..d994fbc272 100644
--- a/lib/eig_trlm.cpp
+++ b/lib/eig_trlm.cpp
@@ -92,7 +92,7 @@ namespace quda
 
       // Lambda that returns mat_norm for LR and returns the relevant alpha
       // (the corresponding Ritz value) for SR
-      auto check_norm = [&] (double sr_norm) -> double {
+      auto check_norm = [&](double sr_norm) -> double {
         if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG)
           return mat_norm;
         else
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index a47f15a794..46fed2c43d 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3014,7 +3014,7 @@ void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_para
 template <class Interface, class... Args>
 void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // color spinor field pointers, and inv_param
                       void *h_gauge, void *milc_fatlinks, void *milc_longlinks,
-                      QudaGaugeParam *gauge_param_,     // gauge field pointers
+                      QudaGaugeParam *gauge_param_,    // gauge field pointers
                       void *h_clover, void *h_clovinv, // clover field pointers
                       Interface op, Args... args)
 {
@@ -3036,8 +3036,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
   // Create a local copy of gauge_param that we can modify without perturbing
   // the original one
-  if (!gauge_param_)
-    errorQuda("Input gauge_param is null");
+  if (!gauge_param_) errorQuda("Input gauge_param is null");
   QudaGaugeParam gauge_param = *gauge_param_;
 
   if (num_sub_partition == 1) { // In this case we don't split the grid.
@@ -3069,8 +3068,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     } else if (milc_fatlinks) {
       is_staggered = true;
       if (param->dslash_type == QUDA_ASQTAD_DSLASH) {
-        if (!milc_longlinks)
-          errorQuda("milc_longlinks is null for an asqtad dslash");
+        if (!milc_longlinks) errorQuda("milc_longlinks is null for an asqtad dslash");
         is_asqtad = true;
       }
     } else {
@@ -3239,7 +3237,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       loadGaugeQuda(collected_gauge.raw_pointer(), &gauge_param);
     } else {
       loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field.raw_pointer(),
-        (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr);
+                           (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr);
     }
     logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n");
 
diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index 1d586ee124..742e026bd2 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -216,14 +216,13 @@ namespace quda {
     bool converged = convergence(r2, heavy_quark_res, stop, param.tol_hq);
 
     if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
-      printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n",
-                 blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p),
-                 blas::norm2(r0), blas::norm2(t));
+      printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n", blas::norm2(x), blas::norm2(r_sloppy),
+                 blas::norm2(v), blas::norm2(p), blas::norm2(r0), blas::norm2(t));
 
     // track if we just performed an exact recalculation of y, r, r2
     bool just_updated = false;
 
-    while ( !converged && k < param.maxiter) {
+    while (!converged && k < param.maxiter) {
       just_updated = false;
 
       matSloppy(v, p);
@@ -253,7 +252,7 @@ namespace quda {
         double s2 = blas::norm2(r_sloppy);
         Complex r0t = blas::cDotProduct(r0, t);
         beta = -r0t / r0v;
-        r2 = s2 - real(omega * conj(tr)) ;
+        r2 = s2 - real(omega * conj(tr));
         // now we can work out if we need to do a reliable update
         updateR = reliable(rNorm, maxrx, maxrr, r2, delta);
       } else {
@@ -263,24 +262,24 @@ namespace quda {
       }
 
       if (param.pipeline && !updateR) {
-        //x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p
+        // x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p
         blas::caxpbypzYmbw(alpha, p, omega, r_sloppy, x_sloppy, t);
-        blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p);
-        //tripleBiCGstabUpdate(alpha, p, omega, r_sloppy, x_sloppy, t, -beta*omega, v, beta, p
+        blas::cxpaypbz(r_sloppy, -beta * omega, v, beta, p);
+        // tripleBiCGstabUpdate(alpha, p, omega, r_sloppy, x_sloppy, t, -beta*omega, v, beta, p
       } else {
-        //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
+        // x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
         rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, r_sloppy, x_sloppy, t, r0);
         rho0 = rho;
         rho = Complex(rho_r2.x, rho_r2.y);
         r2 = rho_r2.z;
       }
 
-      if (use_heavy_quark_res && k % heavy_quark_check==0) {
+      if (use_heavy_quark_res && k % heavy_quark_check == 0) {
         if (&x != &x_sloppy) {
-           heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x_sloppy, r_sloppy).z);
+          heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x_sloppy, r_sloppy).z);
         } else {
-           blas::copy(r, r_sloppy);
-           heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z);
+          blas::copy(r, r_sloppy);
+          heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z);
         }
       }
 
@@ -309,7 +308,7 @@ namespace quda {
         rNorm = sqrt(r2);
         maxrr = rNorm;
         maxrx = rNorm;
-        //r0Norm = rNorm;
+        // r0Norm = rNorm;
         rUpdate++;
 
         just_updated = true;
@@ -319,9 +318,8 @@ namespace quda {
 
       PrintStats("BiCGstab", k, r2, b2, heavy_quark_res);
       if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
-        printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n",
-          blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p),
-          blas::norm2(r0), blas::norm2(t));
+        printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n", blas::norm2(x), blas::norm2(r_sloppy),
+                   blas::norm2(v), blas::norm2(p), blas::norm2(r0), blas::norm2(t));
 
       converged = convergence(r2, heavy_quark_res, stop, param.tol_hq);
 
@@ -347,7 +345,7 @@ namespace quda {
           rNorm = sqrt(r2);
           maxrr = rNorm;
           maxrx = rNorm;
-          //r0Norm = rNorm;
+          // r0Norm = rNorm;
           rUpdate++;
 
           just_updated = true;
@@ -362,9 +360,11 @@ namespace quda {
 
       // update p
       if ((!param.pipeline || updateR) && !converged) { // need to update if not pipeline or did a reliable update
-        if (abs(rho*alpha) == 0.0) beta = 0.0;
-        else beta = (rho/rho0) * (alpha/omega);
-        blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p);
+        if (abs(rho * alpha) == 0.0)
+          beta = 0.0;
+        else
+          beta = (rho / rho0) * (alpha / omega);
+        blas::cxpaypbz(r_sloppy, -beta * omega, v, beta, p);
       }
     }
 
diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu
index 4ac97bc9fb..eadd519c5b 100644
--- a/lib/spinor_dilute.in.cu
+++ b/lib/spinor_dilute.in.cu
@@ -90,7 +90,8 @@ namespace quda
       if constexpr (Nc <= 32) {
         SpinorDilute<real, Ns, Nc>(src, v, type, local_block);
       } else {
-        errorQuda("nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)");
+        errorQuda(
+          "nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)", Nc);
       }
     } else {
       if constexpr (sizeof...(N) > 0)
diff --git a/tests/hisq_stencil_ctest.cpp b/tests/hisq_stencil_ctest.cpp
index 55186c6a5f..6df6d1d977 100644
--- a/tests/hisq_stencil_ctest.cpp
+++ b/tests/hisq_stencil_ctest.cpp
@@ -22,30 +22,31 @@ class HisqStencilTest : public ::testing::TestWithParam<::testing::tuple<QudaPre
     QudaPrecision precision = static_cast<QudaPrecision>(::testing::get<0>(GetParam()));
     QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
 
-    if ((QUDA_PRECISION & precision) == 0
-        || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0)
-      return true;
+    if ((QUDA_PRECISION & precision) == 0 || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) return true;
 
-    const std::array<bool, 16> partition_enabled {true, true, true,  false,  true,  false, false, false,
-                                                  true, false, false, false, true, false, true, true};
+    const std::array<bool, 16> partition_enabled {true, true,  true,  false, true, false, false, false,
+                                                  true, false, false, false, true, false, true,  true};
     if (!ctest_all_partitions && !partition_enabled[::testing::get<3>(GetParam())]) return true;
 
     return false;
   }
 
-  void display_test_info(QudaPrecision prec, QudaReconstructType link_recon, bool has_naik) {
+  void display_test_info(QudaPrecision prec, QudaReconstructType link_recon, bool has_naik)
+  {
     printfQuda("running the following test:\n");
-    printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       Ordering\n");
+    printfQuda(
+      "link_precision           link_reconstruct           space_dimension        T_dimension       Ordering\n");
     printfQuda("%s                       %s                         %d/%d/%d/                  %d             %s \n",
-              get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
+               get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
     printfQuda("Grid partition info:     X  Y  Z  T\n");
     printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
-              dimPartitioned(3));
+               dimPartitioned(3));
     printfQuda("Number of Naiks: %d\n", has_naik ? 2 : 1);
   }
 
 public:
-  virtual void SetUp() {
+  virtual void SetUp()
+  {
     QudaPrecision prec = static_cast<QudaPrecision>(::testing::get<0>(GetParam()));
     QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
     bool has_naik = ::testing::get<2>(GetParam());
@@ -62,14 +63,13 @@ class HisqStencilTest : public ::testing::TestWithParam<::testing::tuple<QudaPre
     display_test_info(prec, recon, has_naik);
   }
 
-  virtual void TearDown() {
+  virtual void TearDown()
+  {
     if (skip()) GTEST_SKIP();
     hisq_stencil_test_wrapper.end();
   }
 
-  static void SetUpTestCase() {
-    initQuda(device_ordinal);
-  }
+  static void SetUpTestCase() { initQuda(device_ordinal); }
 
   // Per-test-case tear-down.
   // Called after the last test in this test case.
@@ -81,10 +81,7 @@ class HisqStencilTest : public ::testing::TestWithParam<::testing::tuple<QudaPre
   }
 };
 
-TEST_P(HisqStencilTest, benchmark)
-{
-  hisq_stencil_test_wrapper.run_test(niter, /**show_metrics =*/true);
-}
+TEST_P(HisqStencilTest, benchmark) { hisq_stencil_test_wrapper.run_test(niter, /**print_metrics =*/true); }
 
 TEST_P(HisqStencilTest, verify)
 {
@@ -147,7 +144,8 @@ int main(int argc, char **argv)
   return test_rc;
 }
 
-std::string gethisqstenciltestname(testing::TestParamInfo<::testing::tuple<QudaPrecision, QudaReconstructType, bool, int>> param)
+std::string
+gethisqstenciltestname(testing::TestParamInfo<::testing::tuple<QudaPrecision, QudaReconstructType, bool, int>> param)
 {
   const QudaPrecision prec = static_cast<QudaPrecision>(::testing::get<0>(param.param));
   const QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(param.param));
@@ -165,15 +163,11 @@ std::string gethisqstenciltestname(testing::TestParamInfo<::testing::tuple<QudaP
 #ifdef MULTI_GPU
 INSTANTIATE_TEST_SUITE_P(QUDA, HisqStencilTest,
                          Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION),
-                                 ::testing::Values(QUDA_RECONSTRUCT_NO),
-                                 ::testing::Bool(),
-                                 Range(0, 16)),
+                                 ::testing::Values(QUDA_RECONSTRUCT_NO), ::testing::Bool(), Range(0, 16)),
                          gethisqstenciltestname);
 #else
 INSTANTIATE_TEST_SUITE_P(QUDA, HisqStencilTest,
                          Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION),
-                                 ::testing::Values(QUDA_RECONSTRUCT_NO),
-                                 ::testing::Bool(),
-                                 ::testing::Values(0)),
+                                 ::testing::Values(QUDA_RECONSTRUCT_NO), ::testing::Bool(), ::testing::Values(0)),
                          gethisqstenciltestname);
 #endif
diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp
index e42dc7b804..3b20287d3b 100644
--- a/tests/hisq_stencil_test.cpp
+++ b/tests/hisq_stencil_test.cpp
@@ -7,30 +7,29 @@ class HisqStencilTest : public ::testing::Test
 protected:
   HisqStencilTestWrapper hisq_stencil_test_wrapper;
 
-  void display_test_info() {
+  void display_test_info()
+  {
     printfQuda("running the following test:\n");
-    printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       Ordering\n");
+    printfQuda(
+      "link_precision           link_reconstruct           space_dimension        T_dimension       Ordering\n");
     printfQuda("%s                       %s                         %d/%d/%d/                  %d             %s \n",
-              get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
+               get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
     printfQuda("Grid partition info:     X  Y  Z  T\n");
     printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
-              dimPartitioned(3));
+               dimPartitioned(3));
     printfQuda("Number of Naiks: %d\n", n_naiks);
   }
 
 public:
-  virtual void SetUp() {
+  virtual void SetUp()
+  {
     hisq_stencil_test_wrapper.init_test();
     display_test_info();
   }
 
-  virtual void TearDown() {
-    hisq_stencil_test_wrapper.end();
-  }
+  virtual void TearDown() { hisq_stencil_test_wrapper.end(); }
 
-  static void SetUpTestCase() {
-    initQuda(device_ordinal);
-  }
+  static void SetUpTestCase() { initQuda(device_ordinal); }
 
   // Per-test-case tear-down.
   // Called after the last test in this test case.
@@ -42,10 +41,7 @@ class HisqStencilTest : public ::testing::Test
   }
 };
 
-TEST_F(HisqStencilTest, benchmark)
-{
-  hisq_stencil_test_wrapper.run_test(niter, /**show_metrics =*/true);
-}
+TEST_F(HisqStencilTest, benchmark) { hisq_stencil_test_wrapper.run_test(niter, /**print_metrics =*/true); }
 
 TEST_F(HisqStencilTest, verify)
 {
diff --git a/tests/hisq_stencil_test_utils.h b/tests/hisq_stencil_test_utils.h
index 4c1f3cfcc3..d340c0edda 100644
--- a/tests/hisq_stencil_test_utils.h
+++ b/tests/hisq_stencil_test_utils.h
@@ -41,7 +41,7 @@ struct HisqStencilTestWrapper {
   static inline std::array<std::array<double, 6>, 3> act_paths;
 
   // initial links in MILC order
-  static inline void* milc_sitelink = nullptr;
+  static inline void *milc_sitelink = nullptr;
 
   // storage for CPU reference fat and long links w/zero Naik
   static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr};
@@ -69,7 +69,8 @@ struct HisqStencilTestWrapper {
   static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
   static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr};
 
-  void set_naik(bool has_naik) {
+  void set_naik(bool has_naik)
+  {
     if (has_naik) {
       eps_naik = -0.03; // semi-arbitrary
       n_naiks = 2;
@@ -79,7 +80,8 @@ struct HisqStencilTestWrapper {
     }
   }
 
-  void init_ctest(QudaPrecision prec_, QudaReconstructType link_recon_, bool has_naik) {
+  void init_ctest(QudaPrecision prec_, QudaReconstructType link_recon_, bool has_naik)
+  {
     prec = prec_;
     link_recon = link_recon_;
 
@@ -101,7 +103,8 @@ struct HisqStencilTestWrapper {
     init();
   }
 
-  void init_test() {
+  void init_test()
+  {
     gauge_param = newQudaGaugeParam();
     setStaggeredGaugeParam(gauge_param);
 
@@ -113,7 +116,8 @@ struct HisqStencilTestWrapper {
     init();
   }
 
-  void init_host() {
+  void init_host()
+  {
     setDims(gauge_param.X);
     dw_setDims(gauge_param.X, 1);
 
@@ -142,12 +146,12 @@ struct HisqStencilTestWrapper {
     // Second path: create X, long links
     act_paths[1] = {
       ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */
-                                                        /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
-      (-1.0 / 24.0),                                    /* Naik */
-      (-1.0 / 8.0) * 0.5,                               /* simple staple */
-      (1.0 / 8.0) * 0.25 * 0.5,                         /* displace link in two directions */
-      (-1.0 / 8.0) * 0.125 * (1.0 / 6.0),               /* displace link in three directions */
-      (-2.0 / 16.0)                                     /* Lepage term, correct O(a^2) 2x ASQTAD */
+      /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
+      (-1.0 / 24.0),                      /* Naik */
+      (-1.0 / 8.0) * 0.5,                 /* simple staple */
+      (1.0 / 8.0) * 0.25 * 0.5,           /* displace link in two directions */
+      (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */
+      (-2.0 / 16.0)                       /* Lepage term, correct O(a^2) 2x ASQTAD */
     };
 
     // Paths for epsilon corrections. Not used if n_naiks = 1.
@@ -165,7 +169,7 @@ struct HisqStencilTestWrapper {
     ////////////////////////////////////
 
     setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
-                              svd_abs_error);
+                               svd_abs_error);
 
     /////////////////
     // Input links //
@@ -214,7 +218,8 @@ struct HisqStencilTestWrapper {
 #endif
   }
 
-  void init() {
+  void init()
+  {
 
     // reset the reconstruct in gauge param
     gauge_param.reconstruct = link_recon;
@@ -245,7 +250,8 @@ struct HisqStencilTestWrapper {
     }
   }
 
-  static void end() {
+  static void end()
+  {
     if (milc_sitelink) host_free(milc_sitelink);
 
     // Clean up GPU compute links
@@ -262,7 +268,8 @@ struct HisqStencilTestWrapper {
     freeGaugeQuda();
   }
 
-  static void destroy() {
+  static void destroy()
+  {
 
     for (int i = 0; i < 4; i++) {
       host_free(fat_reflink[i]);
@@ -292,7 +299,8 @@ struct HisqStencilTestWrapper {
   // X -- after 2nd level of smearing, non-SU(3)
   /*--------------------------------------------------------------------*/
 
-  double llfatCUDA(int niter) {
+  double llfatCUDA(int niter)
+  {
     host_timer_t host_timer;
 
     comm_barrier();
@@ -337,7 +345,8 @@ struct HisqStencilTestWrapper {
     return host_timer.last();
   }
 
-  void run_test(int niter, bool print_metrics = false) {
+  void run_test(int niter, bool print_metrics = false)
+  {
     //////////////////////
     // Perform GPU test //
     //////////////////////
@@ -357,26 +366,26 @@ struct HisqStencilTestWrapper {
     if (print_metrics) {
       // FIXME: does not include unitarization, extra naiks
       int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3];
-      //long long flops = 61632 * (long long)niter; // Constructing V field
+      // long long flops = 61632 * (long long)niter; // Constructing V field
       // Constructing W field?
       // Constructing separate Naiks
-      //flops += 61632 * (long long)niter;     // Constructing X field
-      //flops += (252 * 4) * (long long)niter; // long-link contribution
+      // flops += 61632 * (long long)niter;     // Constructing X field
+      // flops += (252 * 4) * (long long)niter; // long-link contribution
 
       printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter);
 
       printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter,
-                    (flops / niter) / volume, (bytes / niter) / volume);
+                 (flops / niter) / volume, (bytes / niter) / volume);
 
       double gflops = 1.0e-9 * flops / secs;
-        printfQuda("GFLOPS = %f\n", gflops);
+      printfQuda("GFLOPS = %f\n", gflops);
 
       double gbytes = 1.0e-9 * bytes / secs;
-        printfQuda("GBYTES = %f\n", gbytes);
+      printfQuda("GBYTES = %f\n", gbytes);
 
       // Old metric
-      //double perf = flops / (secs * 1024 * 1024 * 1024);
-      //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf);
+      // double perf = flops / (secs * 1024 * 1024 * 1024);
+      // printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf);
     }
   }
 
@@ -407,8 +416,8 @@ struct HisqStencilTestWrapper {
     if (n_naiks > 1) {
       for (int dir = 0; dir < 4; dir++) {
         res[0] = std::max(res[0],
-          compare_floats_v2(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, max_dev,
-                            gauge_param.cpu_prec));
+                          compare_floats_v2(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, max_dev,
+                                            gauge_param.cpu_prec));
       }
 
       strong_check_link(qdp_fatlink_eps, "Fat link GPU results: ", fat_reflink_eps, "CPU reference results:", V,
@@ -416,32 +425,35 @@ struct HisqStencilTestWrapper {
 
       for (int dir = 0; dir < 4; ++dir) {
         res[1] = std::max(res[1],
-          compare_floats_v2(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, max_dev,
-                            gauge_param.cpu_prec));
+                          compare_floats_v2(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, max_dev,
+                                            gauge_param.cpu_prec));
       }
 
       strong_check_link(qdp_longlink_eps, "Long link GPU results: ", long_reflink_eps, "CPU reference results:", V,
                         gauge_param.cpu_prec);
     } else {
       for (int dir = 0; dir < 4; dir++) {
-        res[0] = std::max(res[0],
+        res[0] = std::max(
+          res[0],
           compare_floats_v2(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec));
       }
 
-      strong_check_link(qdp_fatlink, "Fat link GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
+      strong_check_link(qdp_fatlink, "Fat link GPU results: ", fat_reflink, "CPU reference results:", V,
+                        gauge_param.cpu_prec);
 
       for (int dir = 0; dir < 4; ++dir) {
-        res[1] = std::max(res[1],
+        res[1] = std::max(
+          res[1],
           compare_floats_v2(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec));
       }
 
-      strong_check_link(qdp_longlink, "Long link GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec);
+      strong_check_link(qdp_longlink, "Long link GPU results: ", long_reflink, "CPU reference results:", V,
+                        gauge_param.cpu_prec);
     }
 
     printfQuda("Fat link test %s\n", (res[0] < max_dev) ? "PASSED" : "FAILED");
     printfQuda("Long link test %s\n", (res[1] < max_dev) ? "PASSED" : "FAILED");
 
     return res;
-
   }
 };
diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 534d0b2a1b..bb7efa83a4 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -743,15 +743,19 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
   return l2r;
 }
 
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link,
-                                quda::GaugeField &long_link, QudaInvertParam &inv_param) {
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out,
+                                               quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                               QudaInvertParam &inv_param)
+{
   std::vector<quda::ColorSpinorField> out_vector(1);
   out_vector[0] = out;
   return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param);
 }
 
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector,
-                                               quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param)
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in,
+                                               std::vector<quda::ColorSpinorField> &out_vector,
+                                               quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                               QudaInvertParam &inv_param)
 {
   int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
   double l2r_max = 0.0;
@@ -762,8 +766,7 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
   quda::ColorSpinorField ref(csParam);
 
   if (multishift > 1) {
-    if (dslash_type == QUDA_LAPLACE_DSLASH)
-      errorQuda("Multishift solves do not support the laplace operator (yet)");
+    if (dslash_type == QUDA_LAPLACE_DSLASH) errorQuda("Multishift solves do not support the laplace operator (yet)");
 
     if (inv_param.solution_type != QUDA_MATPC_SOLUTION)
       errorQuda("Invalid staggered multishift solution type %d, expected QUDA_MATPC_SOLUTION", inv_param.solution_type);
@@ -771,13 +774,13 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
     // Check the mat_pc type and make sure it's sane
     QudaParity parity = QUDA_INVALID_PARITY;
     switch (inv_param.matpc_type) {
-      case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
-      case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
-      default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
+    case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
+    case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
+    default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
     }
 
     for (int i = 0; i < multishift; i++) {
-      auto& out = out_vector[i];
+      auto &out = out_vector[i];
       double mass = 0.5 * sqrt(inv_param.offset[i]);
       stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type);
 
@@ -789,9 +792,9 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
 
       printfQuda("%dth solution: mass=%f, ", i, mass);
       printfQuda("Shift %2d residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, "
-                "QUDA = %9.6e, host = %9.6e\n",
-                i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r,
-                inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr);
+                 "QUDA = %9.6e, host = %9.6e\n",
+                 i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i],
+                 inv_param.true_res_hq_offset[i], hqr);
       // Empirical: if the cpu residue is more than 1 order the target accuracy, then it fails to converge
       if (sqrt(nrm2 / src2) > 10 * inv_param.tol_offset[i]) {
         printfQuda("Shift %2d has empirically failed to converge\n", i);
@@ -802,20 +805,19 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
     }
 
   } else {
-    auto& out = out_vector[0];
+    auto &out = out_vector[0];
     double mass = inv_param.mass;
     if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
       stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type);
 
       // correct for the massRescale function inside invertQuda
-      if (is_laplace(dslash_type))
-        ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision());
+      if (is_laplace(dslash_type)) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision());
     } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) {
       QudaParity parity = QUDA_INVALID_PARITY;
       switch (inv_param.matpc_type) {
-        case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
-        case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
-        default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
+      case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
+      case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
+      default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
       }
       stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type);
     } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
@@ -831,8 +833,8 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
     double l2r = sqrt(nrm2 / src2);
 
     printfQuda("Residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, QUDA = %9.6e, "
-                "host = %9.6e\n",
-                inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);
+               "host = %9.6e\n",
+               inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);
 
     l2r_max = l2r;
     hqr_max = hqr;
@@ -841,10 +843,10 @@ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::
   return {l2r_max, hqr_max};
 }
 
-double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i,
-                                   QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link)
+double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i,
+                                      QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link)
 {
-  QudaInvertParam& inv_param = *(eig_param.invert_param);
+  QudaInvertParam &inv_param = *(eig_param.invert_param);
   int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
   bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false);
   bool normop = (eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? true : false);
@@ -853,11 +855,15 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co
   // Reverse engineer a "solution_type" to help determine which host dslash needs to be applied
   QudaSolutionType sol_type = QUDA_INVALID_SOLUTION;
   if (normop) {
-    if (use_pc) errorQuda("The normal preconditioned staggered op is not supported");
-    else sol_type = QUDA_MATDAG_MAT_SOLUTION;
+    if (use_pc)
+      errorQuda("The normal preconditioned staggered op is not supported");
+    else
+      sol_type = QUDA_MATDAG_MAT_SOLUTION;
   } else {
-    if (use_pc) sol_type = QUDA_MATPC_SOLUTION;
-    else sol_type = QUDA_MAT_SOLUTION;
+    if (use_pc)
+      sol_type = QUDA_MATPC_SOLUTION;
+    else
+      sol_type = QUDA_MAT_SOLUTION;
   }
 
   // Create temporary spinors
@@ -869,9 +875,9 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co
   } else if (sol_type == QUDA_MATPC_SOLUTION) {
     QudaParity parity = QUDA_INVALID_PARITY;
     switch (inv_param.matpc_type) {
-      case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
-      case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
-      default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
+    case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break;
+    case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break;
+    default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break;
     }
     stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type);
   } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) {
@@ -889,16 +895,16 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co
   return l2r;
 }
 
-double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i,
-                                         QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link)
+double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right,
+                                         double _Complex sigma, int i, QudaEigParam &eig_param,
+                                         quda::GaugeField &fat_link, quda::GaugeField &long_link)
 {
-  QudaInvertParam& inv_param = *(eig_param.invert_param);
+  QudaInvertParam &inv_param = *(eig_param.invert_param);
   int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0;
   bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false);
   double mass = inv_param.mass;
 
-  if (use_pc)
-    errorQuda("The SVD of the preconditioned staggered op is not supported");
+  if (use_pc) errorQuda("The SVD of the preconditioned staggered op is not supported");
 
   // Create temporary spinors
   quda::ColorSpinorParam csParam(spinor_left);
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 0388b2a10d..c464836f71 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -110,60 +110,64 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
 /**
-  * @brief Verify a staggered inversion on the host. This version is a thin wrapper around a version that takes
-  *        an array of outputs as is necessary for handling both single- and multi-shift solves.
-  *
-  * @param in The initial rhs
-  * @param out The solution to A out = in
-  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
-  * @param long_link The long links; null for naive staggered and Laplace
-  * @param inv_param Invert params, used to query the solve type, etc
-  * @return The residual and HQ residual (if requested)
-  */
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link,
-                                quda::GaugeField &long_link, QudaInvertParam &inv_param);
+ * @brief Verify a staggered inversion on the host. This version is a thin wrapper around a version that takes
+ *        an array of outputs as is necessary for handling both single- and multi-shift solves.
+ *
+ * @param in The initial rhs
+ * @param out The solution to A out = in
+ * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+ * @param long_link The long links; null for naive staggered and Laplace
+ * @param inv_param Invert params, used to query the solve type, etc
+ * @return The residual and HQ residual (if requested)
+ */
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out,
+                                               quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                               QudaInvertParam &inv_param);
 
 /**
-  * @brief Verify a single- or multi-shift staggered inversion on the host
-  *
-  * @param in The initial rhs
-  * @param out The solutions to (A + shift) out = in for multiple shifts; shift == 0 for a single shift solve
-  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
-  * @param long_link The long links; null for naive staggered and Laplace
-  * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts
-  * @return The residual and HQ residual (if requested)
-  */
-std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector,
-                                quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param);
+ * @brief Verify a single- or multi-shift staggered inversion on the host
+ *
+ * @param in The initial rhs
+ * @param out_vector The solutions to (A + shift) out = in for multiple shifts; shift == 0 for a single shift solve
+ * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+ * @param long_link The long links; null for naive staggered and Laplace
+ * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts
+ * @return The residual and HQ residual (if requested)
+ */
+std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in,
+                                               std::vector<quda::ColorSpinorField> &out_vector,
+                                               quda::GaugeField &fat_link, quda::GaugeField &long_link,
+                                               QudaInvertParam &inv_param);
 
 /**
-  * @brief Verify a staggered-type eigenvector
-  *
-  * @param spinor The host eigenvector to be verified
-  * @param lambda The host eigenvalue to be verified
-  * @param i The number of the eigenvalue, only used when printing outputs
-  * @param eig_param Eigensolve params, used to query the operator type, etc
-  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
-  * @param long_link The long links; null for naive staggered and Laplace
-  * @return The residual norm
-  */
-double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i,
+ * @brief Verify a staggered-type eigenvector
+ *
+ * @param spinor The host eigenvector to be verified
+ * @param lambda The host eigenvalue to be verified
+ * @param i The number of the eigenvalue, only used when printing outputs
+ * @param eig_param Eigensolve params, used to query the operator type, etc
+ * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+ * @param long_link The long links; null for naive staggered and Laplace
+ * @return The residual norm
+ */
+double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i,
                                       QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link);
 
 /**
-  * @brief Verify a staggered-type singular vector
-  *
-  * @param spinor The host left singular vector to be verified
-  * @param spinor_right The host right singular vector to be verified
-  * @param lambda The host singular value to be verified
-  * @param i The number of the singular value, only used when printing outputs
-  * @param eig_param Eigensolve params, used to query the operator type, etc
-  * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
-  * @param long_link The long links; null for naive staggered and Laplace
-  * @return The residual norm
-  */
-double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i,
-                                         QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link);
+ * @brief Verify a staggered-type singular vector
+ *
+ * @param spinor_left The host left singular vector to be verified
+ * @param spinor_right The host right singular vector to be verified
+ * @param sigma The host singular value to be verified
+ * @param i The number of the singular value, only used when printing outputs
+ * @param eig_param Eigensolve params, used to query the operator type, etc
+ * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied
+ * @param long_link The long links; null for naive staggered and Laplace
+ * @return The residual norm
+ */
+double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right,
+                                         double _Complex sigma, int i, QudaEigParam &eig_param,
+                                         quda::GaugeField &fat_link, quda::GaugeField &long_link);
 
 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 610f81a0b4..bf6bcf8b92 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -129,7 +129,9 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
                  const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
-  if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
+  if (in.Precision() != fat_link.Precision()) {
+    errorQuda("The spinor precision and gauge precision are not the same");
+  }
 
   // assert we have single-parity spinors
   if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET)
@@ -151,27 +153,21 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
   void *qdp_longlink[] = {long_link.data(0), long_link.data(1), long_link.data(2), long_link.data(3)};
   void *ghost_fatlink[]
     = {fat_link.Ghost()[0].data(), fat_link.Ghost()[1].data(), fat_link.Ghost()[2].data(), fat_link.Ghost()[3].data()};
-  void *ghost_longlink[]
-    = {long_link.Ghost()[0].data(), long_link.Ghost()[1].data(), long_link.Ghost()[2].data(), long_link.Ghost()[3].data()};
+  void *ghost_longlink[] = {long_link.Ghost()[0].data(), long_link.Ghost()[1].data(), long_link.Ghost()[2].data(),
+                            long_link.Ghost()[3].data()};
 
   if (in.Precision() == QUDA_DOUBLE_PRECISION) {
-    staggeredDslashReference(static_cast<double*>(out.data()),
-                             reinterpret_cast<double**>(qdp_fatlink),
-                             reinterpret_cast<double**>(qdp_longlink),
-                             reinterpret_cast<double**>(ghost_fatlink),
-                             reinterpret_cast<double**>(ghost_longlink),
-                             static_cast<double*>(in.data()),
-                             reinterpret_cast<double**>(in.fwdGhostFaceBuffer),
-                             reinterpret_cast<double**>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
+    staggeredDslashReference(static_cast<double *>(out.data()), reinterpret_cast<double **>(qdp_fatlink),
+                             reinterpret_cast<double **>(qdp_longlink), reinterpret_cast<double **>(ghost_fatlink),
+                             reinterpret_cast<double **>(ghost_longlink), static_cast<double *>(in.data()),
+                             reinterpret_cast<double **>(in.fwdGhostFaceBuffer),
+                             reinterpret_cast<double **>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
   } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
-    staggeredDslashReference(static_cast<float*>(out.data()),
-                             reinterpret_cast<float**>(qdp_fatlink),
-                             reinterpret_cast<float**>(qdp_longlink),
-                             reinterpret_cast<float**>(ghost_fatlink),
-                             reinterpret_cast<float**>(ghost_longlink),
-                             static_cast<float*>(in.data()),
-                             reinterpret_cast<float**>(in.fwdGhostFaceBuffer),
-                             reinterpret_cast<float**>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
+    staggeredDslashReference(static_cast<float *>(out.data()), reinterpret_cast<float **>(qdp_fatlink),
+                             reinterpret_cast<float **>(qdp_longlink), reinterpret_cast<float **>(ghost_fatlink),
+                             reinterpret_cast<float **>(ghost_longlink), static_cast<float *>(in.data()),
+                             reinterpret_cast<float **>(in.fwdGhostFaceBuffer),
+                             reinterpret_cast<float **>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
   }
 }
 
@@ -179,7 +175,9 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
               const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
-  if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
+  if (in.Precision() != fat_link.Precision()) {
+    errorQuda("The spinor precision and gauge precision are not the same");
+  }
 
   // assert we have full-parity spinors
   if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET)
@@ -201,10 +199,12 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
 }
 
 void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
-              const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type)
+                     const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
-  if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); }
+  if (in.Precision() != fat_link.Precision()) {
+    errorQuda("The spinor precision and gauge precision are not the same");
+  }
 
   // assert we have full-parity spinors
   if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET)
@@ -219,8 +219,8 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga
   stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type);
 }
 
-void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int,
-                QudaParity parity, QudaDslashType dslash_type)
+void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+                const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type)
 {
   // assert sPrecision and gPrecision must be the same
   if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); }
@@ -248,8 +248,9 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi
 
   double msq_x4 = mass * mass * 4;
   if (in.Precision() == QUDA_DOUBLE_PRECISION) {
-    axmy(static_cast<double*>(in.data()), msq_x4, static_cast<double*>(out.data()), Vh * stag_spinor_site_size);
+    axmy(static_cast<double *>(in.data()), msq_x4, static_cast<double *>(out.data()), Vh * stag_spinor_site_size);
   } else {
-    axmy(static_cast<float*>(in.data()), static_cast<float>(msq_x4), static_cast<float*>(out.data()), Vh * stag_spinor_site_size);
+    axmy(static_cast<float *>(in.data()), static_cast<float>(msq_x4), static_cast<float *>(out.data()),
+         Vh * stag_spinor_site_size);
   }
 }
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index c5b73d980b..b39287bfb1 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -12,79 +12,79 @@ using namespace quda;
 void setDims(int *);
 
 /**
-  * @brief Base host routine to apply the even-odd or odd-even component of a staggered-type dslash
-  *
-  * @tparam real_t Datatype used in the host dslash
-  * @param res Host output result
-  * @param fatlink Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
-  * @param longlink Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
-  * @param ghostFatlink Ghost zones for the host fat links
-  * @param ghostLonglink Ghost zones for the host long links
-  * @param spinorField Host input spinor
-  * @param fwd_nbr_spinor Forward ghost zones for the host input spinor
-  * @param back_nbr_spinor Backwards ghost zones for the host input spinor
-  * @param oddBit 0 for D_eo, 1 for D_oe
-  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
-  * @param dslash_type Dslash type
-  */
+ * @brief Base host routine to apply the even-odd or odd-even component of a staggered-type dslash
+ *
+ * @tparam real_t Datatype used in the host dslash
+ * @param res Host output result
+ * @param fatlink Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+ * @param longlink Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+ * @param ghostFatlink Ghost zones for the host fat links
+ * @param ghostLonglink Ghost zones for the host long links
+ * @param spinorField Host input spinor
+ * @param fwd_nbr_spinor Forward ghost zones for the host input spinor
+ * @param back_nbr_spinor Backwards ghost zones for the host input spinor
+ * @param oddBit 0 for D_eo, 1 for D_oe
+ * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+ * @param dslash_type Dslash type
+ */
 template <typename real_t>
 void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink,
                               real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor,
                               real_t **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type);
 
 /**
-  * @brief Apply even-odd or odd-even component of a staggered-type dslash
-  *
-  * @param out Host output rhs
-  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
-  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
-  * @param in Host input spinor
-  * @param oddBit 0 for D_eo, 1 for D_oe
-  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
-  * @param dslash_type Dslash type
-  */
-void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
-                 int oddBit, int daggerBit, QudaDslashType dslash_type);
+ * @brief Apply even-odd or odd-even component of a staggered-type dslash
+ *
+ * @param out Host output rhs
+ * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+ * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+ * @param in Host input spinor
+ * @param oddBit 0 for D_eo, 1 for D_oe
+ * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+ * @param dslash_type Dslash type
+ */
+void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+                 const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type);
 
 /**
-  * @brief Apply the full parity staggered-type dslash
-  *
-  * @param out Host output rhs
-  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
-  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
-  * @param in Host input spinor
-  * @param mass Mass for the dslash operator
-  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
-  * @param dslash_type Dslash type
-  */
-void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
-              double mass, int daggerBit, QudaDslashType dslash_type);
+ * @brief Apply the full parity staggered-type dslash
+ *
+ * @param out Host output rhs
+ * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+ * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+ * @param in Host input spinor
+ * @param mass Mass for the dslash operator
+ * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+ * @param dslash_type Dslash type
+ */
+void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+              const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type);
 
 /**
-  * @brief Apply the full parity staggered-type matdag_mat
-  *
-  * @param out Host output rhs
-  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
-  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
-  * @param in Host input spinor
-  * @param mass Mass for the dslash operator
-  * @param daggerBit 0 for the regular operator, 1 for the dagger operator
-  * @param dslash_type Dslash type
-  */
-void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
-              double mass, int daggerBit, QudaDslashType dslash_type);
+ * @brief Apply the full parity staggered-type matdag_mat
+ *
+ * @param out Host output rhs
+ * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+ * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+ * @param in Host input spinor
+ * @param mass Mass for the dslash operator
+ * @param daggerBit 0 for the regular operator, 1 for the dagger operator
+ * @param dslash_type Dslash type
+ */
+void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+                     const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type);
 
 /**
-  * @brief Apply the even-even or odd-odd preconditioned staggered dslash
-  *
-  * @param out Host output rhs
-  * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
-  * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
-  * @param in Host input spinor
-  * @param mass Mass for the dslash operator
-  * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator
-  * @param parity Parity of preconditioned dslash
-  * @param dslash_type Dslash type
-  */
-void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in,
-                double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type);
+ * @brief Apply the even-even or odd-odd preconditioned staggered dslash
+ *
+ * @param out Host output rhs
+ * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash
+ * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash
+ * @param in Host input spinor
+ * @param mass Mass for the dslash operator
+ * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator
+ * @param parity Parity of preconditioned dslash
+ * @param dslash_type Dslash type
+ */
+void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link,
+                const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type);
diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index dca4bc5e9a..55c1c3f788 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -69,9 +69,7 @@ TEST_P(InvertTest, verify)
   if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq;
 
   auto tol = inv_param.tol;
-  if (is_chiral(inv_param.dslash_type)) {
-    tol *= std::sqrt(static_cast<double>(inv_param.Ls));
-  }
+  if (is_chiral(inv_param.dslash_type)) { tol *= std::sqrt(static_cast<double>(inv_param.Ls)); }
   // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this
   if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50;
   // Slight loss of precision possible when reconstructing full solution
diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index 2d5311632a..65edd69124 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -126,10 +126,8 @@ int main(int argc, char **argv)
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (is_laplace(dslash_type))
-      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (!is_staggered(dslash_type))
-      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+    if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index 7905d39db6..82c84c3225 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -52,8 +52,8 @@ TEST_F(StaggeredDslashTest, verify)
   double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec);
 
   // give it a tiny bump for fixed precision, recon 8
-  if (dslash_test_wrapper.inv_param.cuda_prec <= QUDA_HALF_PRECISION &&
-      dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9)
+  if (dslash_test_wrapper.inv_param.cuda_prec <= QUDA_HALF_PRECISION
+      && dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9)
     tol *= 1.1;
 
   ASSERT_LE(deviation, tol) << "reference and QUDA implementations do not agree";
@@ -89,10 +89,8 @@ int main(int argc, char **argv)
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (is_laplace(dslash_type))
-      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (!is_staggered(dslash_type))
-      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+    if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
   // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash,
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 810c045863..0a3d589ca1 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -81,15 +81,9 @@ struct StaggeredDslashTestWrapper {
     // compare to dslash reference implementation
     printfQuda("Calculating reference implementation...");
     switch (dtest_type) {
-    case dslash_test_type::Dslash:
-      stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type);
-      break;
-    case dslash_test_type::MatPC:
-      stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, parity, dslash_type);
-      break;
-    case dslash_test_type::Mat:
-      stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
-      break;
+    case dslash_test_type::Dslash: stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type); break;
+    case dslash_test_type::MatPC: stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, parity, dslash_type); break;
+    case dslash_test_type::Mat: stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type); break;
     case dslash_test_type::MatDagMat:
       stag_matdag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type);
       break;
@@ -214,7 +208,6 @@ struct StaggeredDslashTestWrapper {
 
     // set verbosity prior to loadGaugeQuda
     setVerbosity(verbosity);
-
   }
 
   void init()
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index e971e0327e..6e717437fe 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -178,13 +178,12 @@ std::vector<double> eigensolve(test_t test_param)
   }
 
   logQuda(QUDA_SUMMARIZE, "Action = %s, Solver = %s, norm-op = %s, even-odd = %s, with SVD = %s, spectrum = %s\n",
-          get_dslash_str(dslash_type),
-          get_eig_type_str(eig_param.eig_type), eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? "true" : "false",
+          get_dslash_str(dslash_type), get_eig_type_str(eig_param.eig_type),
+          eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? "true" : "false",
           eig_param.use_pc == QUDA_BOOLEAN_TRUE ? "true" : "false",
           eig_param.compute_svd == QUDA_BOOLEAN_TRUE ? "true" : "false", get_eig_spectrum_str(eig_param.spectrum));
 
-  if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE))
-    display_test_info(eig_param);
+  if (!enable_testing || getVerbosity() >= QUDA_VERBOSE) display_test_info(eig_param);
 
   // Vector construct START
   //----------------------------------------------------------------------------
@@ -228,7 +227,8 @@ std::vector<double> eigensolve(test_t test_param)
     for (int i = 0; i < eig_n_conv; i++) {
       if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) {
         double _Complex sigma = evals[i];
-        residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, cpuLongQDP);
+        residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP,
+                                                       cpuLongQDP);
       } else {
         double _Complex lambda = evals[i];
         residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP);
@@ -277,10 +277,8 @@ int main(int argc, char **argv)
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (is_laplace(dslash_type))
-      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (!is_staggered(dslash_type))
-      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+    if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
   if (eig_param.arpack_check && !(prec == QUDA_DOUBLE_PRECISION)) {
@@ -288,10 +286,11 @@ int main(int argc, char **argv)
   }
 
   // Sanity check combinations of solve type and solution type
-  if ((solve_type == QUDA_DIRECT_SOLVE && solution_type != QUDA_MAT_SOLUTION) ||
-    (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type != QUDA_MATPC_SOLUTION) ||
-    (solve_type == QUDA_NORMOP_SOLVE && solution_type != QUDA_MATDAG_MAT_SOLUTION)) {
-    errorQuda("Invalid combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type));
+  if ((solve_type == QUDA_DIRECT_SOLVE && solution_type != QUDA_MAT_SOLUTION)
+      || (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type != QUDA_MATPC_SOLUTION)
+      || (solve_type == QUDA_NORMOP_SOLVE && solution_type != QUDA_MATDAG_MAT_SOLUTION)) {
+    errorQuda("Invalid combination of solve_type %s and solution_type %s", get_solve_str(solve_type),
+              get_solution_str(solution_type));
   }
 
   initQuda(device_ordinal);
@@ -301,12 +300,24 @@ int main(int argc, char **argv)
     // the staggered tests will fail. These checks are designed to be consistent
     // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked"
     bool changes = false;
-    if (!compute_fatlong) { compute_fatlong = true; changes = true; }
+    if (!compute_fatlong) {
+      compute_fatlong = true;
+      changes = true;
+    }
 
     double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 1e-4 : 1e-5;
-    if (eig_tol != expected_tol) { eig_tol = expected_tol; changes = true; }
-    if (niter != 1000) { niter = 1000; changes = true; }
-    if (eig_n_kr != 256) { eig_n_kr = 256; changes = true; }
+    if (eig_tol != expected_tol) {
+      eig_tol = expected_tol;
+      changes = true;
+    }
+    if (niter != 1000) {
+      niter = 1000;
+      changes = true;
+    }
+    if (eig_n_kr != 256) {
+      eig_n_kr = 256;
+      changes = true;
+    }
     if (eig_block_size != 4) { eig_block_size = 4; }
 
     if (changes) {
diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp
index 6cf272cb5d..382510f74b 100644
--- a/tests/staggered_eigensolve_test_gtest.hpp
+++ b/tests/staggered_eigensolve_test_gtest.hpp
@@ -12,7 +12,8 @@ class StaggeredEigensolveTest : public ::testing::TestWithParam<test_t>
 };
 
 // Get the solve type that this combination corresponds to
-QudaSolveType get_solve_type(QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd) {
+QudaSolveType get_solve_type(QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd)
+{
   if (use_norm_op == QUDA_BOOLEAN_FALSE && use_pc == QUDA_BOOLEAN_TRUE && compute_svd == QUDA_BOOLEAN_FALSE)
     return QUDA_DIRECT_PC_SOLVE;
   else if (use_norm_op == QUDA_BOOLEAN_TRUE && use_pc == QUDA_BOOLEAN_FALSE && compute_svd == QUDA_BOOLEAN_TRUE)
@@ -37,8 +38,7 @@ bool skip_test(test_t test_param)
     // matpc
 
     // this is only legal for the staggered and asqtad op
-    if (!is_staggered(dslash_type))
-      return true;
+    if (!is_staggered(dslash_type)) return true;
 
     // we can only compute the real part for Lanczos, and real or magnitude for Arnoldi
     switch (eig_type) {
@@ -53,10 +53,9 @@ bool skip_test(test_t test_param)
     }
   } else if (combo_solve_type == QUDA_NORMOP_SOLVE) {
     // matdag_mat
-    
+
     // this is only legal for the staggered and asqtad op
-    if (!is_staggered(dslash_type))
-      return true;
+    if (!is_staggered(dslash_type)) return true;
 
     switch (eig_type) {
     case QUDA_EIG_TR_LANCZOS:
@@ -64,22 +63,22 @@ bool skip_test(test_t test_param)
       if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true;
       break;
     case QUDA_EIG_IR_ARNOLDI:
-      //if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true;
+      // if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true;
       return true; // we skip this because it takes an unnecessarily long time and it's covered elsewhere
       break;
     default: return true; break;
     }
   } else if (combo_solve_type == QUDA_DIRECT_SOLVE) {
     // mat
-    
+
     switch (dslash_type) {
     case QUDA_STAGGERED_DSLASH:
       // only Arnoldi, imaginary part or magnitude works (real part is degenerate)
       // We skip SM because it takes an unnecessarily long time and it's
       // covered by HISQ
       if (eig_type != QUDA_EIG_IR_ARNOLDI) return true;
-      if (spectrum != QUDA_SPECTRUM_LI_EIG && spectrum != QUDA_SPECTRUM_SI_EIG &&
-            spectrum != QUDA_SPECTRUM_LM_EIG) return true;
+      if (spectrum != QUDA_SPECTRUM_LI_EIG && spectrum != QUDA_SPECTRUM_SI_EIG && spectrum != QUDA_SPECTRUM_LM_EIG)
+        return true;
       break;
     case QUDA_ASQTAD_DSLASH:
       // only Arnoldi, imaginary part or magnitude works (real part is degenerate)
@@ -150,7 +149,7 @@ auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG);
 auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG,
                                      QUDA_SPECTRUM_SM_EIG, QUDA_SPECTRUM_LI_EIG, QUDA_SPECTRUM_SI_EIG);
 
-//using test_t = ::testing::tuple<QudaEigType,          // different types of Lanczos/Arnoldi
+// using test_t = ::testing::tuple<QudaEigType,          // different types of Lanczos/Arnoldi
 //                                QudaBoolean,          // Norm op or not
 //                                QudaBoolean,          // Preconditioned op or not
 //                                QudaBoolean,          // SVD or not
@@ -168,7 +167,6 @@ INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredEigensolveTest,
                                             Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum),
                          gettestname);
 
-
 // full system direct solve
 INSTANTIATE_TEST_SUITE_P(DirectFull, StaggeredEigensolveTest,
                          ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE),
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index dea21be65e..113d909fe7 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -122,8 +122,10 @@ void display_legacy_info()
   printfQuda("--test 2 -> --solve-type direct-pc --solution-type mat    --inv-type cg --matpc odd-odd\n");
   printfQuda("--test 3 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even\n");
   printfQuda("--test 4 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd\n");
-  printfQuda("--test 5 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even --multishift 8\n");
-  printfQuda("--test 6 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd   --multishift 8\n");
+  printfQuda(
+    "--test 5 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even --multishift 8\n");
+  printfQuda(
+    "--test 6 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd   --multishift 8\n");
 }
 
 GaugeField cpuFatQDP = {};
@@ -247,8 +249,8 @@ std::vector<std::array<double, 2>> solve(test_t param)
 
   // schwarz parameters
   auto schwarz_param = ::testing::get<6>(param);
-  inv_param.schwarz_type           = ::testing::get<0>(schwarz_param);
-  inv_param.inv_type_precondition  = ::testing::get<1>(schwarz_param);
+  inv_param.schwarz_type = ::testing::get<0>(schwarz_param);
+  inv_param.inv_type_precondition = ::testing::get<1>(schwarz_param);
   inv_param.cuda_prec_precondition = ::testing::get<2>(schwarz_param);
 
   inv_param.residual_type = ::testing::get<7>(param);
@@ -282,7 +284,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
   std::vector<quda::ColorSpinorField> out_multishift(Nsrc * multishift);
   quda::ColorSpinorParam cs_param;
   constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param);
-  std::vector<std::vector<void *>> _hp_multi_x(Nsrc, std::vector<void*>(multishift));
+  std::vector<std::vector<void *>> _hp_multi_x(Nsrc, std::vector<void *>(multishift));
 
   // Staggered vector construct END
   //-----------------------------------------------------------------------------------
@@ -370,7 +372,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
       gflops[n] = inv_param.gflops / inv_param.secs;
       iter[n] = inv_param.iter;
       printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs,
-                  inv_param.gflops / inv_param.secs);
+                 inv_param.gflops / inv_param.secs);
     }
   } else {
     inv_param.num_src = Nsrc;
@@ -392,7 +394,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
     inv_param.gflops /= comm_size() / num_sub_partition;
     quda::comm_allreduce_max(inv_param.secs);
     printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter,
-                inv_param.secs, inv_param.gflops / inv_param.secs);
+               inv_param.secs, inv_param.gflops / inv_param.secs);
   }
 
   // Free the multigrid solver
@@ -408,7 +410,8 @@ std::vector<std::array<double, 2>> solve(test_t param)
       if (multishift > 1) {
         printfQuda("\nSource %d:\n", n);
         // Create an appropriate subset of the full out_multishift vector
-        std::vector<quda::ColorSpinorField> out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift};
+        std::vector<quda::ColorSpinorField> out_subset
+          = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift};
         res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param);
       } else {
         res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param);
@@ -439,7 +442,8 @@ int main(int argc, char **argv)
   add_multigrid_option_group(app);
   add_comms_option_group(app);
   add_testing_option_group(app);
-  app->add_option("--legacy-test-info", print_legacy_info, "Print info on how to reproduce the old '--test #' behavior with flags, then exit");
+  app->add_option("--legacy-test-info", print_legacy_info,
+                  "Print info on how to reproduce the old '--test #' behavior with flags, then exit");
   try {
     app->parse(argc, argv);
   } catch (const CLI::ParseError &e) {
@@ -468,10 +472,8 @@ int main(int argc, char **argv)
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (is_laplace(dslash_type))
-      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
-    if (!is_staggered(dslash_type))
-      errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
+    if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON");
+    if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 
   // Need to add support for LAPLACE MG?
@@ -490,12 +492,24 @@ int main(int argc, char **argv)
     // the staggered tests will fail. These checks are designed to be consistent
     // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked"
     bool changes = false;
-    if (!compute_fatlong) { compute_fatlong = true; changes = true; }
+    if (!compute_fatlong) {
+      compute_fatlong = true;
+      changes = true;
+    }
 
     double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 1e-5 : 1e-6;
-    if (tol != expected_tol) { tol = expected_tol; changes = true; }
-    if (tol_hq != expected_tol) { tol_hq = expected_tol; changes = true; }
-    if (niter != 1000) { niter = 1000; changes = true; }
+    if (tol != expected_tol) {
+      tol = expected_tol;
+      changes = true;
+    }
+    if (tol_hq != expected_tol) {
+      tol_hq = expected_tol;
+      changes = true;
+    }
+    if (niter != 1000) {
+      niter = 1000;
+      changes = true;
+    }
 
     if (changes) {
       printfQuda("For gtest, various defaults are changed:\n");
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 27369b4a2f..a4e7bcda90 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -30,26 +30,29 @@ bool skip_test(test_t param)
   if (prec < prec_sloppy) return true;              // outer precision >= sloppy precision
   if (!(QUDA_PRECISION & prec_sloppy)) return true; // precision not enabled so skip it
   if (!(QUDA_PRECISION & prec_precondition) && prec_precondition != QUDA_INVALID_PRECISION)
-    return true; // precision not enabled so skip it
+    return true;                                    // precision not enabled so skip it
   if (prec_sloppy < prec_precondition) return true; // sloppy precision >= preconditioner precision
 
   // Skip if the inverter does not support batched update and batched update is greater than one
   if (!support_solution_accumulator_pipeline(inverter_type) && solution_accumulator_pipeline > 1) return true;
   // There's no MLocal or MdagMLocal support yet, this is left in for reference
-  //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ)
+  // if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ)
   //  if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true;
 
   if (is_laplace(dslash_type)) {
     if (multishift > 1) return true; // Laplace doesn't support multishift
-    if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE) return true; // Laplace only supports direct solves
+    if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE)
+      return true; // Laplace only supports direct solves
   }
 
   if (is_staggered(dslash_type)) {
     // the staggered and asqtad operators aren't HPD
-    if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type)) return true;
+    if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type))
+      return true;
 
     // MR struggles with the staggered and asqtad spectrum, it's not MR's fault
-    if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true;
+    if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER)
+      return true;
   }
 
   // split-grid doesn't support multigrid at present
@@ -87,19 +90,16 @@ TEST_P(StaggeredInvertTest, verify)
   if (solution_type == QUDA_MAT_SOLUTION) {
     if (solve_type == QUDA_DIRECT_PC_SOLVE)
       verify_tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps
-    if (solve_type == QUDA_NORMOP_SOLVE)
-      verify_tol /= (0.5 * mass); // a proxy for the condition number
+    if (solve_type == QUDA_NORMOP_SOLVE) verify_tol /= (0.5 * mass); // a proxy for the condition number
   }
 
   // The power iterations method of determining the Chebyshev window
   // breaks down due to the nature of the spectrum of the direct operator
   auto ca_basis_tmp = inv_param.ca_basis;
-  if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER)
-    inv_param.ca_basis = QUDA_POWER_BASIS;
+  if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER) inv_param.ca_basis = QUDA_POWER_BASIS;
 
   // Single precision needs a tiny bump due to small host/device precision deviations
-  if (prec == QUDA_SINGLE_PRECISION)
-    verify_tol *= 1.01;
+  if (prec == QUDA_SINGLE_PRECISION) verify_tol *= 1.01;
 
   for (auto rsd : solve(GetParam())) {
     if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], verify_tol); }
@@ -136,14 +136,14 @@ using ::testing::Combine;
 using ::testing::Values;
 
 auto staggered_pc_solvers
-  = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER, QUDA_GCR_INVERTER,
-           QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
+  = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER,
+           QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
 
 auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER);
 
-auto direct_solvers
-  = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER,
-           QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
+auto direct_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER,
+                             QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER,
+                             QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
 
 auto sloppy_precisions
   = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION);
@@ -166,8 +166,9 @@ INSTANTIATE_TEST_SUITE_P(EvenOdd, StaggeredInvertTest,
 
 // full system normal solve
 INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredInvertTest,
-                         Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE),
-                                 sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION, QUDA_MAT_SOLUTION),
+                                 Values(QUDA_NORMOP_SOLVE), sloppy_precisions, Values(1),
+                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 
 // full system direct solve
@@ -178,23 +179,22 @@ INSTANTIATE_TEST_SUITE_P(Full, StaggeredInvertTest,
 
 // preconditioned multi-shift solves
 INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest,
-                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
-                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(10),
-                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
+                                 sloppy_precisions, Values(10), solution_accumulator_pipelines, no_schwarz,
+                                 no_heavy_quark),
                          gettestname);
 
 // Heavy-Quark preconditioned solves
 INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest,
-                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
-                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1),
-                                 solution_accumulator_pipelines, no_schwarz,
+                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
+                                 sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz,
                                  Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)),
                          gettestname);
 
 // These are left in but commented out for future reference
 
 // Schwarz-preconditioned normal solves
-//INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest,
+// INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest,
 //                         Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION),
 //                                 Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1),
 //                                 solution_accumulator_pipelines,
@@ -204,11 +204,10 @@ INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest,
 //                         gettestname);
 
 // Schwarz-preconditioned direct solves
-//INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, StaggeredInvertTest,
+// INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, StaggeredInvertTest,
 //                         Combine(Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
 //                                 sloppy_precisions, Values(1), solution_accumulator_pipelines,
 //                                 Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_MR_INVERTER, QUDA_CA_GCR_INVERTER),
 //                                         Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)),
 //                                 no_heavy_quark),
 //                         gettestname);
-
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index 8530286dbd..17b12edcea 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -502,7 +502,8 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
     ->transform(CLI::QUDACheckedTransformer(mass_normalization_map));
 
   quda_app
-    ->add_option("--matpc", matpc_type, "Matrix preconditioning type (even-even (default), odd-odd, even-even-asym, odd-odd-asym)")
+    ->add_option("--matpc", matpc_type,
+                 "Matrix preconditioning type (even-even (default), odd-odd, even-even-asym, odd-odd-asym)")
     ->transform(CLI::QUDACheckedTransformer(matpc_type_map));
   quda_app->add_option("--msrc", Msrc,
                        "Used for testing non-square block blas routines where nsrc defines the other dimension");
@@ -601,9 +602,9 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
                        "The pipeline length for fused solution accumulation (default 0, no pipelining)");
 
   quda_app
-    ->add_option(
-      "--solution-type", solution_type,
-      "The solution we desire (mat (default for Wilson-type), mat-dag-mat, mat-pc (default for staggered-type), mat-pc-dag-mat-pc (default for Wilson-type multi-shift))")
+    ->add_option("--solution-type", solution_type,
+                 "The solution we desire (mat (default for Wilson-type), mat-dag-mat, mat-pc (default for "
+                 "staggered-type), mat-pc-dag-mat-pc (default for Wilson-type multi-shift))")
     ->transform(CLI::QUDACheckedTransformer(solution_type_map));
 
   quda_app
@@ -617,8 +618,9 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
     ->expected(4);
 
   quda_app
-    ->add_option("--solve-type", solve_type,
-                 "The type of solve to do (direct, direct-pc (default for staggered-type), normop, normop-pc (default for Wilson-type), normerr, normerr-pc)")
+    ->add_option("--solve-type", solve_type,
+                 "The type of solve to do (direct, direct-pc (default for staggered-type), normop, normop-pc "
+                 "(default for Wilson-type), normerr, normerr-pc)")
     ->transform(CLI::QUDACheckedTransformer(solve_type_map));
   quda_app
     ->add_option("--solver-ext-lib-type", solver_ext_lib, "Set external library for the solvers  (default Eigen library)")
@@ -759,9 +761,12 @@ void add_eigen_option_group(std::shared_ptr<QUDAApp> quda_app)
 
   opgroup->add_option("--eig-use-dagger", eig_use_dagger,
                       "Solve the Mdag problem instead of M (MMdag if eig-use-normop == true) (default false)");
-  opgroup->add_option("--eig-use-normop", eig_use_normop,
-                      "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false for Wilson-type, true for staggered-type)");
-  opgroup->add_option("--eig-use-pc", eig_use_pc, "Solve the Even-Odd preconditioned problem (default false for Wilson-type, true for staggered-type)");
+  opgroup->add_option("--eig-use-normop", eig_use_normop,
+                      "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false for "
+                      "Wilson-type, true for staggered-type)");
+  opgroup->add_option(
+    "--eig-use-pc", eig_use_pc,
+    "Solve the Even-Odd preconditioned problem (default false for Wilson-type, true for staggered-type)");
   opgroup->add_option("--eig-use-poly-acc", eig_use_poly_acc, "Use Chebyshev polynomial acceleration in the eigensolver");
 }
 
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 15dae3b9d2..24659b23e1 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -339,7 +339,7 @@ bool is_normal_solve(QudaInverterType inv_type, QudaSolveType solve_type)
 
 bool is_hermitian_solver(QudaInverterType type)
 {
-  switch(type) {
+  switch (type) {
   case QUDA_CG_INVERTER:
   case QUDA_CA_CG_INVERTER: return true;
   default: return false;
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 9431b3ce67..24a8668e7d 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -41,7 +41,8 @@ extern QudaPrecision &cuda_prec_refinement_sloppy;
 extern QudaPrecision &cuda_prec_ritz;
 
 // Determine if the Laplace operator has been defined
-constexpr bool is_enabled_laplace() {
+constexpr bool is_enabled_laplace()
+{
 #ifdef QUDA_LAPLACE
   return true;
 #else
diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp
index 24eface277..85e7993ba5 100644
--- a/tests/utils/staggered_gauge_utils.cpp
+++ b/tests/utils/staggered_gauge_utils.cpp
@@ -31,8 +31,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
   // Similarly, gauge links can only be built in single or double, so upscale the build precision
   // if neccessary.
   auto gauge_param = gauge_param_in;
-  if (gauge_param.cuda_prec < QUDA_SINGLE_PRECISION)
-    gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
+  if (gauge_param.cuda_prec < QUDA_SINGLE_PRECISION) gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
   gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
   gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; // probably irrelevant
 

From 431c4ecf2884e7630d8b8b64655a1cf7d0bbe3f7 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 10 Jan 2024 09:06:46 -0800
Subject: [PATCH 53/53] Updated comments in TRLM to reflect code changes

---
 lib/eig_trlm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp
index d994fbc272..02d8c26246 100644
--- a/lib/eig_trlm.cpp
+++ b/lib/eig_trlm.cpp
@@ -174,7 +174,7 @@ namespace quda
       logQuda(QUDA_SUMMARIZE, "TRLM computed the requested %d vectors in %d restart steps and %d OP*x operations.\n",
               n_conv, restart_iter, iter);
 
-      // Dump all Ritz values and residua if using Chebyshev
+      // Dump all Ritz values and residua
       for (int i = 0; i < n_conv; i++) {
         logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]);
       }