From b22beda16259eef5af67c920d9b72f24121dd856 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 17 Aug 2023 14:24:39 -0400 Subject: [PATCH 01/53] Misc things I noticed elsewhere --- lib/inv_ca_gcr.cpp | 3 +++ tests/utils/set_params.cpp | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp index f9e605ea86..feb8555334 100644 --- a/lib/inv_ca_gcr.cpp +++ b/lib/inv_ca_gcr.cpp @@ -146,6 +146,7 @@ namespace quda create(x, b); if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_PREAMBLE); + if (param.is_preconditioner) commGlobalReductionPush(param.global_reduction); // compute b2, but only if we need to bool fixed_iteration = param.sloppy_converge && n_krylov == param.maxiter && !param.compute_true_res; @@ -397,6 +398,8 @@ namespace quda } PrintSummary("CA-GCR", total_iter, r2, b2, stop, param.tol_hq); + + if (param.is_preconditioner) commGlobalReductionPop(); } } // namespace quda diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index b862c3ced4..62cd9e8c90 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -937,9 +937,11 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param) // domain decomposition preconditioner parameters inv_param.inv_type_precondition = precon_type; + inv_param.schwarz_type = precon_schwarz_type; + inv_param.precondition_cycle = precon_schwarz_cycle; inv_param.tol_precondition = tol_precondition; inv_param.maxiter_precondition = maxiter_precondition; - inv_param.verbosity_precondition = QUDA_SILENT; + inv_param.verbosity_precondition = verbosity_precondition; inv_param.cuda_prec_precondition = prec_precondition; inv_param.cuda_prec_eigensolver = prec_eigensolver; @@ -952,6 +954,11 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param) inv_param.ca_lambda_min = ca_lambda_min; inv_param.ca_lambda_max = ca_lambda_max; + // Set preconditioner CA info + inv_param.ca_basis_precondition = ca_basis_precondition; + 
inv_param.ca_lambda_min_precondition = ca_lambda_min_precondition; + inv_param.ca_lambda_max_precondition = ca_lambda_max_precondition; + inv_param.solution_type = solution_type; inv_param.solve_type = solve_type; inv_param.matpc_type = matpc_type; From b78720ec1f9f7dd1e0bdae5a3582c311abf37463 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 17 Aug 2023 14:41:20 -0400 Subject: [PATCH 02/53] Removed argc/argv hacks because they are not used under the hood anyway --- tests/staggered_dslash_ctest.cpp | 13 +------------ tests/staggered_dslash_test.cpp | 13 +------------ tests/staggered_dslash_test_utils.h | 16 ++++++---------- 3 files changed, 8 insertions(+), 34 deletions(-) diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index 8505b04fe6..bb20115554 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -2,9 +2,6 @@ using namespace quda; -// For loading the gauge fields -int argc_copy; -char **argv_copy; bool ctest_all_partitions = false; using ::testing::Bool; @@ -77,7 +74,7 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuplelisteners(); if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index f61e62d88c..936d457158 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -2,9 +2,6 @@ using namespace quda; -int argc_copy; -char **argv_copy; - class StaggeredDslashTest : public ::testing::Test { protected: @@ -26,7 +23,7 @@ class StaggeredDslashTest : public ::testing::Test virtual void SetUp() { - dslash_test_wrapper.init_test(argc_copy, argv_copy); + dslash_test_wrapper.init_test(); display_test_info(); } @@ -72,14 +69,6 @@ int main(int argc, char **argv) initComms(argc, argv, gridsize_from_cmdline); - // The 'SetUp()' method of the Google Test class from which DslashTest - // in derived has no arguments, but QUDA's 
implementation requires the - // use of argc and argv to set up the test via the function 'init'. - // As a workaround, we declare argc_copy and argv_copy as global pointers - // so that they are visible inside the 'init' function. - argc_copy = argc; - argv_copy = argv; - // Ensure gtest prints only from rank 0 ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 5c6d885673..0c8f434031 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -74,10 +74,6 @@ struct StaggeredDslashTestWrapper { Dirac *dirac; - // For loading the gauge fields - int argc_copy; - char **argv_copy; - // Split grid options bool test_split_grid = false; int num_src = 1; @@ -111,7 +107,7 @@ struct StaggeredDslashTestWrapper { } } - void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon_) + void init_ctest(int precision, QudaReconstructType link_recon_) { gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); @@ -131,10 +127,10 @@ struct StaggeredDslashTestWrapper { link_recon = link_recon_; - init(argc, argv); + init(); } - void init_test(int argc, char **argv) + void init_test() { gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); @@ -142,10 +138,10 @@ struct StaggeredDslashTestWrapper { setStaggeredGaugeParam(gauge_param); setStaggeredInvertParam(inv_param); - init(argc, argv); + init(); } - void init(int argc, char **argv) + void init() { inv_param.split_grid[0] = grid_partition[0]; inv_param.split_grid[1] = grid_partition[1]; @@ -187,7 +183,7 @@ struct StaggeredDslashTestWrapper { bool gauge_loaded = false; constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu, - qdp_fatlink_gpu, gauge_param, argc, argv, gauge_loaded); + 
qdp_fatlink_gpu, gauge_param, 0, nullptr, gauge_loaded); // Alright, we've created all the void** links. // Create the void* pointers From b2fc275b962601b490a2c3f2547804dfbd911429 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 18 Aug 2023 16:49:20 -0400 Subject: [PATCH 03/53] Cleaned up staggered dslash (c)test, enabled support for testing half links/other missing things --- tests/staggered_dslash_ctest.cpp | 33 ++--- tests/staggered_dslash_test.cpp | 32 ++--- tests/staggered_dslash_test_utils.h | 197 +++++++++++--------------- tests/staggered_eigensolve_test.cpp | 4 +- tests/staggered_invert_test.cpp | 4 +- tests/utils/host_utils.h | 2 +- tests/utils/staggered_gauge_utils.cpp | 6 +- tests/utils/staggered_host_utils.cpp | 15 +- 8 files changed, 127 insertions(+), 166 deletions(-) diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index bb20115554..ceffb74bb8 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -24,17 +24,6 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) { - warningQuda("Fixed precision unsupported in fat/long compute, skipping..."); - return true; - } - - if (dslash_type == QUDA_ASQTAD_DSLASH && compute_fatlong && (getReconstructNibble(recon) & 1)) { - warningQuda("Reconstruct 9 unsupported in fat/long compute, skipping..."); - return true; - } - if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) { warningQuda("Fixed precision unsupported for Laplace operator, skipping..."); return true; @@ -100,6 +89,10 @@ TEST_P(StaggeredDslashTest, verify) double deviation = dslash_test_wrapper.verify(); double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec); + if (dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9 + && dslash_test_wrapper.inv_param.cuda_prec >= QUDA_HALF_PRECISION) + tol *= 10; // if recon 9, 
we tolerate a greater deviation + ASSERT_LE(deviation, tol) << "Reference CPU and QUDA implementations do not agree"; } @@ -125,13 +118,9 @@ int main(int argc, char **argv) ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } - // Only these fermions are supported in this file. Ensure a reasonable default, - // ensure that the default is improved staggered - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) { - printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type), - get_dslash_str(QUDA_ASQTAD_DSLASH)); - dslash_type = QUDA_ASQTAD_DSLASH; - } + // Only these fermions are supported in this file + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't // ask to build the fat/long links... it doesn't make sense. 
@@ -156,12 +145,8 @@ int main(int argc, char **argv) } } - if (dslash_type == QUDA_LAPLACE_DSLASH) { - if (dtest_type != dslash_test_type::Mat) { - errorQuda("Test type %s is not supported for the Laplace operator.\n", - get_string(dtest_type_map, dtest_type).c_str()); - } - } + if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat) + errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str()); int test_rc = RUN_ALL_TESTS(); diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index 936d457158..7c3524dacf 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -48,6 +48,12 @@ TEST_F(StaggeredDslashTest, verify) double deviation = dslash_test_wrapper.verify(); double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec); + + // give it a tiny bump for fixed precision, recon 8 + if (dslash_test_wrapper.inv_param.cuda_prec <= QUDA_HALF_PRECISION && + dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9) + tol *= 1.1; + ASSERT_LE(deviation, tol) << "reference and QUDA implementations do not agree"; } @@ -56,6 +62,9 @@ int main(int argc, char **argv) // initalize google test ::testing::InitGoogleTest(&argc, argv); + // override the default dslash from Wilson + dslash_type = QUDA_ASQTAD_DSLASH; + // command line options auto app = make_app(); app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map)); @@ -73,13 +82,9 @@ int main(int argc, char **argv) ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } - // Only these fermions are supported in this file. 
Ensure a reasonable default, - // ensure that the default is improved staggered - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) { - printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type), - get_dslash_str(QUDA_ASQTAD_DSLASH)); - dslash_type = QUDA_ASQTAD_DSLASH; - } + // Only these fermions are supported in this file + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, // and don't ask to build the fat/long links... it doesn't make sense. @@ -103,17 +108,8 @@ int main(int argc, char **argv) } } - if (dslash_type == QUDA_LAPLACE_DSLASH) { - if (dtest_type != dslash_test_type::Mat) { - errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str()); - } - } - - // If we're building fat/long links, there are some - // tests we have to skip. 
- if (dslash_type == QUDA_ASQTAD_DSLASH && compute_fatlong) { - if (prec < QUDA_SINGLE_PRECISION) { errorQuda("Fixed-point precision unsupported in fat/long compute"); } - } + if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat) + errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str()); int test_rc = RUN_ALL_TESTS(); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 0c8f434031..65cbd21dfa 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -44,17 +44,9 @@ struct DslashTime { struct StaggeredDslashTestWrapper { - void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; - QudaGaugeParam gauge_param; QudaInvertParam inv_param; - void *milc_fatlink_gpu; - void *milc_longlink_gpu; - - cpuGaugeField *cpuFat = nullptr; - cpuGaugeField *cpuLong = nullptr; - ColorSpinorField spinor; ColorSpinorField spinorOut; ColorSpinorField spinorRef; @@ -65,10 +57,13 @@ struct StaggeredDslashTestWrapper { std::vector vp_spinor; std::vector vp_spinor_out; - // In the HISQ case, we include building fat/long links in this unit test - void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr}; - void **ghost_fatlink_cpu = nullptr, **ghost_longlink_cpu = nullptr; + void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; + void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; + void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; + void *milc_fatlink = nullptr; + void *milc_longlink = nullptr; + GaugeField *cpuFat = nullptr; + GaugeField *cpuLong = nullptr; QudaParity parity = QUDA_EVEN_PARITY; @@ -78,24 +73,27 @@ struct StaggeredDslashTestWrapper { bool test_split_grid = false; int num_src = 1; + // Whether or not we need the ghost zones + bool need_ghost_zone = false; + void staggeredDslashRef() { // compare to dslash reference 
implementation printfQuda("Calculating reference implementation..."); switch (dtest_type) { case dslash_test_type::Dslash: - staggeredDslash(spinorRef, qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, spinor, + staggeredDslash(spinorRef, qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor, parity, dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); break; case dslash_test_type::MatPC: - staggeredMatDagMat(spinorRef, qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, spinor, + staggeredMatDagMat(spinorRef, qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor, mass, 0, inv_param.cpu_prec, gauge_param.cpu_prec, tmpCpu, parity, dslash_type); break; case dslash_test_type::Mat: // the !dagger is to reconcile the QUDA convention of D_stag = {{ 2m, -D_{eo}}, -D_{oe}, 2m}} vs the host convention without the minus signs - staggeredDslash(spinorRef.Even(), qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, + staggeredDslash(spinorRef.Even(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor.Odd(), QUDA_EVEN_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); - staggeredDslash(spinorRef.Odd(), qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, + staggeredDslash(spinorRef.Odd(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor.Even(), QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); if (dslash_type == QUDA_LAPLACE_DSLASH) { xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec); @@ -116,7 +114,6 @@ struct StaggeredDslashTestWrapper { setStaggeredInvertParam(inv_param); auto prec = getPrecision(precision); - setVerbosity(QUDA_SUMMARIZE); gauge_param.cuda_prec = prec; gauge_param.cuda_prec_sloppy = prec; @@ -161,88 +158,53 @@ struct StaggeredDslashTestWrapper { Nsrc 
= 1; } - // Allocate a lot of memory because I'm very confused - void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - - milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - - void *qdp_fatlink_gpu[4]; - void *qdp_longlink_gpu[4]; - + // Allocate fields for (int dir = 0; dir < 4; dir++) { qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - - qdp_fatlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - - qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_fatlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); } + milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - bool gauge_loaded = false; - constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu, - qdp_fatlink_gpu, gauge_param, 0, nullptr, gauge_loaded); - - // Alright, we've created all the void** links. 
- // Create the void* pointers - reorderQDPtoMILC(milc_fatlink_gpu, qdp_fatlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - // Create ghost zones for CPU fields, - // prepare and load the GPU fields - -#ifdef MULTI_GPU - gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS; + // For load, etc gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; - GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink_cpu); - cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuFat = new cpuGaugeField(cpuFatParam); - ghost_fatlink_cpu = cpuFat->Ghost(); - - if (dslash_type == QUDA_ASQTAD_DSLASH) { - gauge_param.type = QUDA_ASQTAD_LONG_LINKS; - GaugeFieldParam cpuLongParam(gauge_param, milc_longlink_cpu); - cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuLong = new cpuGaugeField(cpuLongParam); - ghost_longlink_cpu = cpuLong ? cpuLong->Ghost() : nullptr; - } -#endif - - gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS; - if (dslash_type == QUDA_STAGGERED_DSLASH) { - gauge_param.reconstruct = gauge_param.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ? - QUDA_RECONSTRUCT_13 : - (link_recon == QUDA_RECONSTRUCT_8) ? 
QUDA_RECONSTRUCT_9 : - link_recon; - } else { - gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; - } - // set verbosity prior to loadGaugeQuda - setVerbosity(verbosity); + // Dummy arg needed because other tests load the gauge field more than once + bool gauge_loaded = false; + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, gauge_loaded); + // Reorder gauge fields to MILC order + reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + + // Create ghost gauge fields in case of multi GPU builds. + gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ? + QUDA_SU3_LINKS : + QUDA_ASQTAD_FAT_LINKS; + gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + gauge_param.location = QUDA_CPU_FIELD_LOCATION; - printfQuda("Sending fat links to GPU\n"); - loadGaugeQuda(milc_fatlink_gpu, &gauge_param); + GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink); + cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; + cpuFat = GaugeField::Create(cpuFatParam); gauge_param.type = QUDA_ASQTAD_LONG_LINKS; - -#ifdef MULTI_GPU - gauge_param.ga_pad *= 3; -#endif - - if (dslash_type == QUDA_ASQTAD_DSLASH) { - gauge_param.staggered_phase_type = QUDA_STAGGERED_PHASE_NO; - gauge_param.reconstruct = gauge_param.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ? - QUDA_RECONSTRUCT_13 : - (link_recon == QUDA_RECONSTRUCT_8) ? 
QUDA_RECONSTRUCT_9 : - link_recon; - printfQuda("Sending long links to GPU\n"); - loadGaugeQuda(milc_longlink_gpu, &gauge_param); + GaugeFieldParam cpuLongParam(gauge_param, milc_longlink); + cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; + cpuLong = GaugeField::Create(cpuLongParam); + + // Override link reconstruct as appropriate for staggered or asqtad + if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) { + if (link_recon == QUDA_RECONSTRUCT_12) link_recon = QUDA_RECONSTRUCT_13; + if (link_recon == QUDA_RECONSTRUCT_8) link_recon = QUDA_RECONSTRUCT_9; } + loadFatLongGaugeQuda(milc_fatlink, milc_longlink, gauge_param); + + // reset the reconstruct in gauge param + gauge_param.reconstruct = link_recon; + + // Create an appropriate host ColorSpinorParam ColorSpinorParam csParam; csParam.nColor = 3; csParam.nSpin = 1; @@ -251,7 +213,6 @@ struct StaggeredDslashTestWrapper { csParam.x[4] = 1; csParam.setPrecision(inv_param.cpu_prec); - // inv_param.solution_type = QUDA_MAT_SOLUTION; csParam.pad = 0; if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; @@ -298,39 +259,40 @@ struct StaggeredDslashTestWrapper { DiracParam diracParam; setDiracParam(diracParam, &inv_param, pc); dirac = Dirac::create(diracParam); - - for (int dir = 0; dir < 4; dir++) { - host_free(qdp_fatlink_gpu[dir]); - host_free(qdp_longlink_gpu[dir]); - host_free(qdp_inlink[dir]); - } - host_free(milc_fatlink_cpu); - host_free(milc_longlink_cpu); } void end() { for (int dir = 0; dir < 4; dir++) { - if (qdp_fatlink_cpu[dir] != nullptr) { - host_free(qdp_fatlink_cpu[dir]); - qdp_fatlink_cpu[dir] = nullptr; + if (qdp_inlink[dir] != nullptr) { + host_free(qdp_inlink[dir]); + qdp_inlink[dir] = nullptr; + } + if (qdp_fatlink[dir] != nullptr) { + host_free(qdp_fatlink[dir]); + qdp_fatlink[dir] = nullptr; } - if (qdp_longlink_cpu[dir] != nullptr) { - host_free(qdp_longlink_cpu[dir]); - 
qdp_longlink_cpu[dir] = nullptr; + if (qdp_longlink[dir] != nullptr) { + host_free(qdp_longlink[dir]); + qdp_longlink[dir] = nullptr; } } + if (milc_fatlink) { + host_free(milc_fatlink); + milc_fatlink = nullptr; + } + + if (milc_longlink) { + host_free(milc_longlink); + milc_longlink = nullptr; + } + if (dirac != nullptr) { delete dirac; dirac = nullptr; } - host_free(milc_fatlink_gpu); - milc_fatlink_gpu = nullptr; - host_free(milc_longlink_gpu); - milc_longlink_gpu = nullptr; - freeGaugeQuda(); if (cpuFat) { @@ -362,7 +324,7 @@ struct StaggeredDslashTestWrapper { _hp_x[i] = vp_spinor_out[i].V(); _hp_b[i] = vp_spinor[i].V(); } - dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink_gpu, milc_longlink_gpu, + dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, qdp_fatlink, qdp_longlink, &gauge_param); } else { @@ -371,11 +333,18 @@ struct StaggeredDslashTestWrapper { host_timer.start(); - switch (dtest_type) { - case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break; - case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break; - case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break; - default: errorQuda("Test type %d not defined on staggered dslash", static_cast(dtest_type)); + if (dslash_type == QUDA_LAPLACE_DSLASH) { + switch (dtest_type) { + case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break; + default: errorQuda("Test type %d not defined on Laplace operator", static_cast(dtest_type)); + } + } else { + switch (dtest_type) { + case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break; + case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break; + case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break; + default: errorQuda("Test type %d not defined on staggered dslash", static_cast(dtest_type)); + } } host_timer.stop(); diff --git 
a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 70877d36d2..911d58a2f9 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -132,7 +132,9 @@ int main(int argc, char **argv) milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv); + // Dummy arg needed because other tests load the gauge field more than once + bool gauge_loaded = false; + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, gauge_loaded); // Compute plaquette. Routine is aware that the gauge fields already have the phases on them. double plaq[3]; diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 27d752f1b6..4e2481da84 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -234,7 +234,9 @@ int main(int argc, char **argv) // For load, etc gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv); + // Dummy arg needed because other tests load the gauge field more than once + bool gauge_loaded = false; + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, gauge_loaded); // Reorder gauge fields to MILC order reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 66c46fb5ea..1d2692b25e 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -51,7 +51,7 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli void 
**qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded); void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv); + QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded); void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *, QudaDslashType dslash_type); void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param); diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp index 2759e3489b..e9e18948d4 100644 --- a/tests/utils/staggered_gauge_utils.cpp +++ b/tests/utils/staggered_gauge_utils.cpp @@ -26,8 +26,12 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat void **qdp_inlink, QudaGaugeParam &gauge_param_in, double **act_path_coeffs, double eps_naik, size_t gSize, int n_naiks) { - // since a lot of intermediaries can be general matrices, override the recon in `gauge_param_in` + // Intermediates can be general matrices, so override the reconstruct. + // Similarly, gauge links can only be built in single or double, so upscale the build precision + // if necessary. 
auto gauge_param = gauge_param_in; + if (gauge_param.cuda_prec < QUDA_SINGLE_PRECISION) + gauge_param.cuda_prec = QUDA_SINGLE_PRECISION; gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; // probably irrelevant diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp index 021bbd6877..4694aa829a 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -75,16 +75,19 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli } void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv) + QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded) { gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; if (latfile.size() > 0) { - // load in the command line supplied gauge field using QIO and LIME - read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); - if (dslash_type != QUDA_LAPLACE_DSLASH) { - applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); - } + if (!gauge_loaded) { + // load in the command line supplied gauge field using QIO and LIME + read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); + if (dslash_type != QUDA_LAPLACE_DSLASH) { + applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); + } + gauge_loaded = true; + } // else gauge already loaded } else { int construct_type = (unit_gauge) ? 
0 : 1; if (dslash_type == QUDA_LAPLACE_DSLASH) { From cc31020ebda149ad98aa596e2cd1cf9ffa0d28e4 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 29 Aug 2023 18:27:56 -0400 Subject: [PATCH 04/53] Various cleanup of gauge fields in staggered test exes --- tests/host_reference/dslash_reference.cpp | 12 +-- tests/host_reference/dslash_reference.h | 3 +- .../staggered_dslash_reference.cpp | 93 ++++++++----------- .../staggered_dslash_reference.h | 19 ++-- tests/staggered_dslash_test_utils.h | 18 ++-- tests/staggered_invert_test.cpp | 12 +-- 6 files changed, 68 insertions(+), 89 deletions(-) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 9fc53fe6bf..63ed621c80 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -744,8 +744,7 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou } double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[], - void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param, + quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift) { switch (test_type) { @@ -757,10 +756,8 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not // have the minus sign. Passing in QUDA_DAG_YES solves this // discrepancy. 
- staggeredDslash(ref.Even(), qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out.Odd(), QUDA_EVEN_PARITY, - QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); - staggeredDslash(ref.Odd(), qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out.Even(), QUDA_ODD_PARITY, - QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); + staggeredDslash(ref.Even(), fat_link, long_link, out.Odd(), QUDA_EVEN_PARITY, QUDA_DAG_YES, dslash_type); + staggeredDslash(ref.Odd(), fat_link, long_link, out.Even(), QUDA_ODD_PARITY, QUDA_DAG_YES, dslash_type); if (dslash_type == QUDA_LAPLACE_DSLASH) { xpay(out.V(), kappa, ref.V(), ref.Length(), gauge_param.cpu_prec); @@ -775,8 +772,7 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi case 5: // multi mass CG, even parity solution, solving EVEN system case 6: // multi mass CG, odd parity solution, solving ODD system - staggeredMatDagMat(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, - gauge_param.cpu_prec, tmp, + staggeredMatDagMat(ref, fat_link, long_link, out, mass, 0, tmp, (test_type == 3 || test_type == 5) ? 
QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type); break; } diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 44392628c2..42f90fed91 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -109,8 +109,7 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv); double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[], - void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param, + quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift); // i represents a "half index" into an even or odd "half lattice". diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 86ecd17464..24a2932078 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -11,6 +11,7 @@ #include #include "misc.h" #include +#include #include @@ -32,24 +33,24 @@ template void display_link_internal(Float *link) // if oddBit is one: calculate odd parity spinor elements // if daggerBit is zero: perform ordinary dslash operator // if daggerBit is one: perform hermitian conjugate of dslash -template +template #ifdef MULTI_GPU -void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, gFloat **ghostFatlink, - gFloat **ghostLonglink, sFloat *spinorField, sFloat **fwd_nbr_spinor, - sFloat **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type) +void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink, + real_t **ghostLonglink, real_t 
*spinorField, real_t **fwd_nbr_spinor, + real_t **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type) #else -void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, gFloat **, gFloat **, sFloat *spinorField, - sFloat **, sFloat **, int oddBit, int daggerBit, QudaDslashType dslash_type) +void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **, real_t **, real_t *spinorField, + real_t **, real_t **, int oddBit, int daggerBit, QudaDslashType dslash_type) #endif { for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0; - gFloat *fatlinkEven[4], *fatlinkOdd[4]; - gFloat *longlinkEven[4], *longlinkOdd[4]; + real_t *fatlinkEven[4], *fatlinkOdd[4]; + real_t *longlinkEven[4], *longlinkOdd[4]; #ifdef MULTI_GPU - gFloat *ghostFatlinkEven[4], *ghostFatlinkOdd[4]; - gFloat *ghostLonglinkEven[4], *ghostLonglinkOdd[4]; + real_t *ghostFatlinkEven[4], *ghostFatlinkOdd[4]; + real_t *ghostLonglinkEven[4], *ghostLonglinkOdd[4]; #endif for (int dir = 0; dir < 4; dir++) { @@ -72,28 +73,28 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, for (int dir = 0; dir < 8; dir++) { #ifdef MULTI_GPU const int nFace = dslash_type == QUDA_ASQTAD_DSLASH ? 3 : 1; - gFloat *fatlnk + real_t *fatlnk = gaugeLink_mg4dir(sid, dir, oddBit, fatlinkEven, fatlinkOdd, ghostFatlinkEven, ghostFatlinkOdd, 1, 1); - gFloat *longlnk = dslash_type == QUDA_ASQTAD_DSLASH ? + real_t *longlnk = dslash_type == QUDA_ASQTAD_DSLASH ? gaugeLink_mg4dir(sid, dir, oddBit, longlinkEven, longlinkOdd, ghostLonglinkEven, ghostLonglinkOdd, 3, 3) : nullptr; - sFloat *first_neighbor_spinor = spinorNeighbor_5d_mgpu( + real_t *first_neighbor_spinor = spinorNeighbor_5d_mgpu( sid, dir, oddBit, spinorField, fwd_nbr_spinor, back_nbr_spinor, 1, nFace, stag_spinor_site_size); - sFloat *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ? + real_t *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ? 
spinorNeighbor_5d_mgpu(sid, dir, oddBit, spinorField, fwd_nbr_spinor, back_nbr_spinor, 3, nFace, stag_spinor_site_size) : nullptr; #else - gFloat *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, 1); - gFloat *longlnk + real_t *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, 1); + real_t *longlnk = dslash_type == QUDA_ASQTAD_DSLASH ? gaugeLink(sid, dir, oddBit, longlinkEven, longlinkOdd, 3) : nullptr; - sFloat *first_neighbor_spinor + real_t *first_neighbor_spinor = spinorNeighbor_5d(sid, dir, oddBit, spinorField, 1, stag_spinor_site_size); - sFloat *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ? + real_t *third_neighbor_spinor = dslash_type == QUDA_ASQTAD_DSLASH ? spinorNeighbor_5d(sid, dir, oddBit, spinorField, 3, stag_spinor_site_size) : nullptr; #endif - sFloat gaugedSpinor[stag_spinor_site_size]; + real_t gaugedSpinor[stag_spinor_site_size]; if (dir % 2 == 0) { su3Mul(gaugedSpinor, fatlnk, first_neighbor_spinor); @@ -122,10 +123,12 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, } // 4-d volume } -void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit, - QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type) +void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) { + // assert sPrecision and gPrecision must be the same + if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } + QudaParity otherparity = QUDA_INVALID_PARITY; if (oddBit == QUDA_EVEN_PARITY) { otherparity = QUDA_ODD_PARITY; @@ -141,36 +144,24 @@ void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, voi void **fwd_nbr_spinor = in.fwdGhostFaceBuffer; void 
**back_nbr_spinor = in.backGhostFaceBuffer; - if (sPrecision == QUDA_DOUBLE_PRECISION) { - if (gPrecision == QUDA_DOUBLE_PRECISION) { - staggeredDslashReference((double *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink, - (double **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor, - (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type); - } else { - staggeredDslashReference((double *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink, - (float **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor, - (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type); - } - } else { - if (gPrecision == QUDA_DOUBLE_PRECISION) { - staggeredDslashReference((float *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink, - (double **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor, - (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type); - } else { - staggeredDslashReference((float *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink, - (float **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor, - (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type); - } + if (in.Precision() == QUDA_DOUBLE_PRECISION) { + staggeredDslashReference((double *)out.V(), (double **)fat_link.Gauge_p(), (double **)long_link.Gauge_p(), + (double **)fat_link.Ghost(), (double **)long_link.Ghost(), + (double *)in.V(), (double **)fwd_nbr_spinor, + (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type); + } else if (in.Precision() == QUDA_SINGLE_PRECISION) { + staggeredDslashReference((float *)out.V(), (float **)fat_link.Gauge_p(), (float **)long_link.Gauge_p(), + (float **)fat_link.Ghost(), (float **)long_link.Ghost(), + (float *)in.V(), (float **)fwd_nbr_spinor, + (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type); } } -void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const 
ColorSpinorField &in, double mass, int dagger_bit, - QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity, - QudaDslashType dslash_type) +void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit, + ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same - if (sPrecision != gPrecision) { errorQuda("Spinor precision and gPrecison is not the same"); } + if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } QudaParity otherparity = QUDA_INVALID_PARITY; if (parity == QUDA_EVEN_PARITY) { @@ -181,14 +172,12 @@ void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, errorQuda("full parity not supported in function"); } - staggeredDslash(tmp, fatlink, longlink, ghost_fatlink, ghost_longlink, in, otherparity, dagger_bit, sPrecision, - gPrecision, dslash_type); + staggeredDslash(tmp, fat_link, long_link, in, otherparity, dagger_bit, dslash_type); - staggeredDslash(out, fatlink, longlink, ghost_fatlink, ghost_longlink, tmp, parity, dagger_bit, sPrecision, - gPrecision, dslash_type); + staggeredDslash(out, fat_link, long_link, tmp, parity, dagger_bit, dslash_type); double msq_x4 = mass * mass * 4; - if (sPrecision == QUDA_DOUBLE_PRECISION) { + if (in.Precision() == QUDA_DOUBLE_PRECISION) { axmy((double *)in.V(), (double)msq_x4, (double *)out.V(), Vh * stag_spinor_site_size); } else { axmy((float *)in.V(), (float)msq_x4, (float *)out.V(), Vh * stag_spinor_site_size); diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index 54d40fdc0d..4a473c114d 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -11,16 +11,13 @@ using namespace quda; void setDims(int 
*); -template -void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, gFloat **ghostFatlink, - gFloat **ghostLonglink, sFloat *spinorField, sFloat **fwd_nbr_spinor, - sFloat **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type); +template +void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink, + real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor, + real_t **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type); -void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit, - QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type); +void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, int oddBit, int daggerBit, + QudaDslashType dslash_type); -void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit, - QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity, - QudaDslashType dslash_type); +void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit, + ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 65cbd21dfa..73a1cac005 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -82,19 +82,15 @@ struct StaggeredDslashTestWrapper { printfQuda("Calculating reference implementation..."); switch (dtest_type) { case dslash_test_type::Dslash: - staggeredDslash(spinorRef, qdp_fatlink, qdp_longlink, 
(void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor, - parity, dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); + staggeredDslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type); break; case dslash_test_type::MatPC: - staggeredMatDagMat(spinorRef, qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), spinor, - mass, 0, inv_param.cpu_prec, gauge_param.cpu_prec, tmpCpu, parity, dslash_type); + staggeredMatDagMat(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type); break; case dslash_test_type::Mat: // the !dagger is to reconcile the QUDA convention of D_stag = {{ 2m, -D_{eo}}, -D_{oe}, 2m}} vs the host convention without the minus signs - staggeredDslash(spinorRef.Even(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), - spinor.Odd(), QUDA_EVEN_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); - staggeredDslash(spinorRef.Odd(), qdp_fatlink, qdp_longlink, (void**)cpuFat->Ghost(), (void**)cpuLong->Ghost(), - spinor.Even(), QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); + staggeredDslash(spinorRef.Even(), *cpuFat, *cpuLong, spinor.Odd(), QUDA_EVEN_PARITY, !dagger, dslash_type); + staggeredDslash(spinorRef.Odd(), *cpuFat, *cpuLong, spinor.Even(), QUDA_ODD_PARITY, !dagger, dslash_type); if (dslash_type == QUDA_LAPLACE_DSLASH) { xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec); } else { @@ -184,12 +180,14 @@ struct StaggeredDslashTestWrapper { gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; gauge_param.location = QUDA_CPU_FIELD_LOCATION; - GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink); + GaugeFieldParam cpuFatParam(gauge_param, qdp_fatlink); + cpuFatParam.order = QUDA_QDP_GAUGE_ORDER; cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuFat = GaugeField::Create(cpuFatParam); gauge_param.type = QUDA_ASQTAD_LONG_LINKS; - GaugeFieldParam cpuLongParam(gauge_param, 
milc_longlink); + GaugeFieldParam cpuLongParam(gauge_param, qdp_longlink); + cpuLongParam.order = QUDA_QDP_GAUGE_ORDER; cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuLong = GaugeField::Create(cpuLongParam); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 79d099afb3..210a10c176 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -261,12 +261,14 @@ int main(int argc, char **argv) gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; gauge_param.location = QUDA_CPU_FIELD_LOCATION; - GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink); + GaugeFieldParam cpuFatParam(gauge_param, qdp_fatlink); + cpuFatParam.order = QUDA_QDP_GAUGE_ORDER; cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuFat = GaugeField::Create(cpuFatParam); gauge_param.type = QUDA_ASQTAD_LONG_LINKS; - GaugeFieldParam cpuLongParam(gauge_param, milc_longlink); + GaugeFieldParam cpuLongParam(gauge_param, qdp_longlink); + cpuLongParam.order = QUDA_QDP_GAUGE_ORDER; cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuLong = GaugeField::Create(cpuLongParam); @@ -361,8 +363,7 @@ int main(int argc, char **argv) for (int k = 0; k < Nsrc; k++) { if (verify_results) - verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, qdp_fatlink, qdp_longlink, (void **)cpuFat->Ghost(), - (void **)cpuLong->Ghost(), gauge_param, inv_param, 0); + verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0); } } else if (test_type == 5 || test_type == 6) { // case 5: // multi mass CG, even parity solution, solving EVEN system @@ -417,8 +418,7 @@ int main(int argc, char **argv) for (int i = 0; i < multishift; i++) { printfQuda("%dth solution: mass=%f, ", i, masses[i]); - verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], qdp_fatlink, qdp_longlink, - (void **)cpuFat->Ghost(), (void **)cpuLong->Ghost(), gauge_param, inv_param, i); + verifyStaggeredInversion(*tmp, *ref, *in[k], 
qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i); } } } else { From 104c404366bbebbbc3ea0e483663aecdcc8edd0a Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 30 Aug 2023 12:28:01 -0400 Subject: [PATCH 05/53] Various bits of function cleanup, making host verify names more consistent with Wilson-type verifies --- tests/host_reference/dslash_reference.cpp | 23 ++++------- .../staggered_dslash_reference.cpp | 39 +++++++++++++++---- .../staggered_dslash_reference.h | 11 ++++-- tests/staggered_dslash_test_utils.h | 13 ++----- 4 files changed, 50 insertions(+), 36 deletions(-) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 63ed621c80..7841d63d4a 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -747,24 +747,17 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift) { + int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; + switch (test_type) { case 0: // full parity solution, full parity system case 1: // full parity solution, solving EVEN EVEN prec system case 2: // full parity solution, solving ODD ODD prec system + stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); - // In QUDA, the full staggered operator has the sign convention - // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not - // have the minus sign. Passing in QUDA_DAG_YES solves this - // discrepancy. 
- staggeredDslash(ref.Even(), fat_link, long_link, out.Odd(), QUDA_EVEN_PARITY, QUDA_DAG_YES, dslash_type); - staggeredDslash(ref.Odd(), fat_link, long_link, out.Even(), QUDA_ODD_PARITY, QUDA_DAG_YES, dslash_type); - - if (dslash_type == QUDA_LAPLACE_DSLASH) { - xpay(out.V(), kappa, ref.V(), ref.Length(), gauge_param.cpu_prec); - ax(0.5 / kappa, ref.V(), ref.Length(), gauge_param.cpu_prec); - } else { - axpy(2 * mass, out.V(), ref.V(), ref.Length(), gauge_param.cpu_prec); - } + // exact reason for this tbd, this isn't needed in the dslash test... + if (dslash_type == QUDA_LAPLACE_DSLASH) + ax(0.5 / kappa, ref.V(), ref.Length(), ref.Precision()); break; case 3: // even parity solution, solving EVEN system @@ -772,8 +765,8 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi case 5: // multi mass CG, even parity solution, solving EVEN system case 6: // multi mass CG, odd parity solution, solving ODD system - staggeredMatDagMat(ref, fat_link, long_link, out, mass, 0, tmp, - (test_type == 3 || test_type == 5) ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, + (test_type == 3 || test_type == 5) ? 
QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type); break; } diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 24a2932078..c263fa93ef 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -123,8 +123,8 @@ void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, } // 4-d volume } -void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) +void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } @@ -157,8 +157,33 @@ void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const Ga } } -void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit, - ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type) +void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) +{ + // assert sPrecision and gPrecision must be the same + if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } + + // assert we have full-parity spinors + if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) + errorQuda("Unexpected site subsets for stag_mat, out %d in %d", out.SiteSubset(), in.SiteSubset()); + + // In QUDA, the full staggered operator has the sign convention 
+ // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not + // have the minus sign. Inverting the expected dagger convention + // solves this discrepancy. + stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type); + stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type); + + if (dslash_type == QUDA_LAPLACE_DSLASH) { + double kappa = 1.0 / (8 + mass); + xpay((void*)in.V(), kappa, out.V(), out.Length(), out.Precision()); + } else { + axpy(2 * mass, (void*)in.V(), out.V(), out.Length(), out.Precision()); + } +} + +void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int, + ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } @@ -172,9 +197,9 @@ void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const errorQuda("full parity not supported in function"); } - staggeredDslash(tmp, fat_link, long_link, in, otherparity, dagger_bit, dslash_type); - - staggeredDslash(out, fat_link, long_link, tmp, parity, dagger_bit, dslash_type); + // dagger bit does not matter + stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type); + stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type); double msq_x4 = mass * mass * 4; if (in.Precision() == QUDA_DOUBLE_PRECISION) { diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index 4a473c114d..9fc6c9d641 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -16,8 +16,11 @@ void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostLonglink, real_t *spinorField, 
real_t **fwd_nbr_spinor, real_t **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type); -void staggeredDslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, int oddBit, int daggerBit, - QudaDslashType dslash_type); +void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, + int oddBit, int daggerBit, QudaDslashType dslash_type); -void staggeredMatDagMat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit, - ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type); +void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, + double mass, int daggerBit, QudaDslashType dslash_type); + +void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, + double mass, int dagger_bit, ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 73a1cac005..b59d21d6b6 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -82,20 +82,13 @@ struct StaggeredDslashTestWrapper { printfQuda("Calculating reference implementation..."); switch (dtest_type) { case dslash_test_type::Dslash: - staggeredDslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type); + stag_dslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type); break; case dslash_test_type::MatPC: - staggeredMatDagMat(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type); + stag_matpc(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type); break; case dslash_test_type::Mat: - // the !dagger is to reconcile the QUDA convention of D_stag = 
{{ 2m, -D_{eo}}, -D_{oe}, 2m}} vs the host convention without the minus signs - staggeredDslash(spinorRef.Even(), *cpuFat, *cpuLong, spinor.Odd(), QUDA_EVEN_PARITY, !dagger, dslash_type); - staggeredDslash(spinorRef.Odd(), *cpuFat, *cpuLong, spinor.Even(), QUDA_ODD_PARITY, !dagger, dslash_type); - if (dslash_type == QUDA_LAPLACE_DSLASH) { - xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec); - } else { - axpy(2 * mass, spinor.V(), spinorRef.V(), spinor.Length(), gauge_param.cpu_prec); - } + stag_mat(spinorRef, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type); break; default: errorQuda("Test type %d not defined", static_cast(dtest_type)); } From 1f8f89c2d0e4e52cb5d077f4710caa399e71b023 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 31 Aug 2023 13:25:33 -0400 Subject: [PATCH 06/53] Added support for mdagm tests for staggered, asqtad --- tests/staggered_dslash_ctest.cpp | 11 ++++++----- tests/staggered_dslash_test_utils.h | 15 +++++++++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index ceffb74bb8..6945e4ab13 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -20,14 +20,11 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple(::testing::get<1>(GetParam())); if ((QUDA_PRECISION & getPrecision(::testing::get<0>(GetParam()))) == 0 - || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) { + || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) return true; - } - if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) { - warningQuda("Fixed precision unsupported for Laplace operator, skipping..."); + if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) return true; - } const std::array partition_enabled {true, true, true, false, true, false, false, 
false, true, false, false, false, true, false, true, true}; @@ -102,6 +99,10 @@ int main(int argc, char **argv) { // initalize google test ::testing::InitGoogleTest(&argc, argv); + + // override the default dslash from Wilson + dslash_type = QUDA_ASQTAD_DSLASH; + auto app = make_app(); app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map)); app->add_option("--all-partitions", ctest_all_partitions, "Test all instead of reduced combination of partitions"); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index b59d21d6b6..5ee65c23f1 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -24,10 +24,12 @@ using namespace quda; dslash_test_type dtest_type = dslash_test_type::Dslash; CLI::TransformPairs dtest_type_map { - {"Dslash", dslash_test_type::Dslash}, {"MatPC", dslash_test_type::MatPC}, {"Mat", dslash_test_type::Mat} - // left here for completeness but not support in staggered dslash test + {"Dslash", dslash_test_type::Dslash}, + {"MatPC", dslash_test_type::MatPC}, + {"Mat", dslash_test_type::Mat}, + {"MatDagMat", dslash_test_type::MatDagMat}, + // left here for completeness but not supported in staggered dslash test // {"MatPCDagMatPC", dslash_test_type::MatPCDagMatPC}, - // {"MatDagMat", dslash_test_type::MatDagMat}, // {"M5", dslash_test_type::M5}, // {"M5inv", dslash_test_type::M5inv}, // {"Dslash4pre", dslash_test_type::Dslash4pre} @@ -90,6 +92,10 @@ struct StaggeredDslashTestWrapper { case dslash_test_type::Mat: stag_mat(spinorRef, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type); break; + case dslash_test_type::MatDagMat: + stag_mat(tmpCpu, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type); + stag_mat(spinorRef, *cpuFat, *cpuLong, tmpCpu, mass, 1 - dagger, dslash_type); + break; default: errorQuda("Test type %d not defined", static_cast(dtest_type)); } } @@ -205,7 +211,7 @@ struct StaggeredDslashTestWrapper { 
csParam.setPrecision(inv_param.cpu_prec); csParam.pad = 0; - if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) { + if (dtest_type != dslash_test_type::Mat && dtest_type != dslash_test_type::MatDagMat) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; inv_param.solution_type = QUDA_MATPC_SOLUTION; @@ -334,6 +340,7 @@ struct StaggeredDslashTestWrapper { case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break; case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break; case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break; + case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinor); break; default: errorQuda("Test type %d not defined on staggered dslash", static_cast(dtest_type)); } } From 94a332f1774f52055adb280d8fc488326462ff61 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 12 Sep 2023 11:19:12 -0400 Subject: [PATCH 07/53] Small cleanup of treatment of naik terms --- tests/staggered_dslash_ctest.cpp | 21 ++++++-------------- tests/staggered_dslash_test.cpp | 20 ++++++------------- tests/utils/set_params.cpp | 34 ++++++++++---------------------- 3 files changed, 22 insertions(+), 53 deletions(-) diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index 6945e4ab13..669482d96a 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -125,25 +125,16 @@ int main(int argc, char **argv) // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't // ask to build the fat/long links... it doesn't make sense. 
- if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) { + if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) errorQuda( "Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\".\n"); - compute_fatlong = true; - } // Set n_naiks to 2 if eps_naik != 0.0 - if (dslash_type == QUDA_ASQTAD_DSLASH) { - if (eps_naik != 0.0) { - if (compute_fatlong) { - n_naiks = 2; - printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n"); - } else { - eps_naik = 0.0; - printfQuda("Not computing fat-long, ignoring epsilon correction.\n"); - } - } else { - printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n"); - } + if (eps_naik != 0.0) { + if (compute_fatlong) + n_naiks = 2; + else + eps_naik = 0.0; // to avoid potential headaches } if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat) diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index 7c3524dacf..e24c8092c9 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -88,24 +88,16 @@ int main(int argc, char **argv) // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, // and don't ask to build the fat/long links... it doesn't make sense. 
- if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) { + if (latfile.size() > 0 && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) errorQuda( "Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\"."); - } // Set n_naiks to 2 if eps_naik != 0.0 - if (dslash_type == QUDA_ASQTAD_DSLASH) { - if (eps_naik != 0.0) { - if (compute_fatlong) { - n_naiks = 2; - printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n"); - } else { - eps_naik = 0.0; - printfQuda("Not computing fat-long, ignoring epsilon correction.\n"); - } - } else { - printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n"); - } + if (eps_naik != 0.0) { + if (compute_fatlong) + n_naiks = 2; + else + eps_naik = 0.0; // to avoid potential headaches } if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat) diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index 62cd9e8c90..aa95dbbd34 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -1429,18 +1429,11 @@ void setQudaStaggeredInvTestParams() } // Set n_naiks to 2 if eps_naik != 0.0 - if (dslash_type == QUDA_ASQTAD_DSLASH) { - if (eps_naik != 0.0) { - if (compute_fatlong) { - n_naiks = 2; - printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n"); - } else { - eps_naik = 0.0; - printfQuda("Not computing fat-long, ignoring epsilon correction.\n"); - } - } else { - printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n"); - } + if (eps_naik != 0.0) { + if (compute_fatlong) + n_naiks = 2; + else + eps_naik = 0.0; // to avoid potential headaches } } @@ -1474,17 +1467,10 @@ void setQudaStaggeredEigTestParams() } // Set n_naiks to 2 if eps_naik != 0.0 - if (dslash_type == QUDA_ASQTAD_DSLASH) { - if (eps_naik != 0.0) { - if (compute_fatlong) { - n_naiks = 2; - printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n"); - } else { - eps_naik = 
0.0; - printfQuda("Not computing fat-long, ignoring epsilon correction.\n"); - } - } else { - printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n"); - } + if (eps_naik != 0.0) { + if (compute_fatlong) + n_naiks = 2; + else + eps_naik = 0.0; // to avoid potential headaches } } From dd67aa0a932bd8be45d50d93893d3142b04ed293 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 12 Sep 2023 12:47:31 -0400 Subject: [PATCH 08/53] Massive cleanup for staggered_invert/eigensolve_test, removed all enumerated test types, set new defaults, improved command line arg documentation --- tests/host_reference/dslash_reference.cpp | 30 ++--- tests/host_reference/dslash_reference.h | 2 +- tests/staggered_eigensolve_test.cpp | 36 ++---- tests/staggered_invert_test.cpp | 58 ++------- tests/utils/command_line_params.cpp | 12 +- tests/utils/host_utils.h | 3 +- tests/utils/misc.cpp | 17 --- tests/utils/misc.h | 1 - tests/utils/set_params.cpp | 142 ++++++++-------------- 9 files changed, 100 insertions(+), 201 deletions(-) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 7841d63d4a..2afc145395 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -744,30 +744,30 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou } double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param, + quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param, int shift) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 
1 : 0; - switch (test_type) { - case 0: // full parity solution, full parity system - case 1: // full parity solution, solving EVEN EVEN prec system - case 2: // full parity solution, solving ODD ODD prec system + if (inv_param.solution_type == QUDA_MAT_SOLUTION) { stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); // exact reason for this tbd, this isn't needed in the dslash test... if (dslash_type == QUDA_LAPLACE_DSLASH) ax(0.5 / kappa, ref.V(), ref.Length(), ref.Precision()); - break; - - case 3: // even parity solution, solving EVEN system - case 4: // odd parity solution, solving ODD system - case 5: // multi mass CG, even parity solution, solving EVEN system - case 6: // multi mass CG, odd parity solution, solving ODD system - - stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, - (test_type == 3 || test_type == 5) ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type); - break; + } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { + QudaParity parity = QUDA_INVALID_PARITY; + switch (inv_param.matpc_type) { + case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; + case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; + default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; + } + stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type); + } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { + stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); + } else { + errorQuda("Invalid staggered solution type %d", inv_param.solution_type); } int len = 0; diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 42f90fed91..32632ddf20 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -109,7 +109,7 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu QudaInvertParam 
&inv_param, void **gauge, void *clover, void *clover_inv); double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaGaugeParam &gauge_param, + quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param, int shift); // i represents a "half index" into an even or odd "half lattice". diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 911d58a2f9..d5b411e0cf 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -20,10 +20,10 @@ void display_test_info() { printfQuda("running the following test:\n"); - printfQuda("prec sloppy_prec link_recon sloppy_link_recon test_type S_dimension T_dimension\n"); - printfQuda("%s %s %s %s %s %d/%d/%d %d \n", get_prec_str(prec), + printfQuda("prec sloppy_prec link_recon sloppy_link_recon S_dimension T_dimension\n"); + printfQuda("%s %s %s %s %d/%d/%d %d \n", get_prec_str(prec), get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy), - get_staggered_test_type(test_type), xdim, ydim, zdim, tdim); + xdim, ydim, zdim, tdim); printfQuda("\n Eigensolver parameters\n"); printfQuda(" - solver mode %s\n", get_eig_type_str(eig_type)); @@ -60,13 +60,11 @@ void display_test_info() int main(int argc, char **argv) { - // Set a default - solve_type = QUDA_INVALID_SOLVE; + // Set defaults + setQudaStaggeredDefaultInvTestParams(); auto app = make_app(); add_eigen_option_group(app); - CLI::TransformPairs test_type_map {{"full", 0}, {"even", 3}, {"odd", 4}}; - app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map)); try { app->parse(argc, argv); @@ -80,15 +78,9 @@ int main(int argc, char **argv) // Set values for precisions via the command line. 
setQudaPrecisions(); - // Only these fermions are supported in this file. Ensure a reasonable default, - // ensure that the default is improved staggered - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) { - printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type), - get_dslash_str(QUDA_ASQTAD_DSLASH)); - dslash_type = QUDA_ASQTAD_DSLASH; - } - - setQudaStaggeredEigTestParams(); + // Only these fermions are supported in this file + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); display_test_info(); @@ -167,10 +159,9 @@ int main(int argc, char **argv) // QUDA eigensolver test //---------------------------------------------------------------------------- - switch (test_type) { - case 0: // full parity solution - case 3: // even - case 4: // odd + if ((solve_type == QUDA_DIRECT_SOLVE && solution_type == QUDA_MAT_SOLUTION) || + (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type == QUDA_MATPC_SOLUTION) || + (solve_type == QUDA_NORMOP_SOLVE && solution_type == QUDA_MATDAG_MAT_SOLUTION)) { // This function returns the host_evecs and host_evals pointers, populated with // the requested data, at the requested prec. All the information needed to // perfom the solve is in the eig_param container. @@ -182,9 +173,8 @@ int main(int argc, char **argv) time += (double)clock(); printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? 
"ARPACK" : "QUDA", time / CLOCKS_PER_SEC); - break; - - default: errorQuda("Unsupported test type"); + } else { + errorQuda("Unsupported combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type)); } // switch diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 210a10c176..6cec447810 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -104,6 +104,7 @@ void display_test_info() int main(int argc, char **argv) { + setQudaStaggeredDefaultInvTestParams(); setQudaDefaultMgTestParams(); // Parse command line options auto app = make_app(); @@ -111,16 +112,13 @@ int main(int argc, char **argv) add_deflation_option_group(app); add_multigrid_option_group(app); add_comms_option_group(app); - CLI::TransformPairs test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3}, - {"odd", 4}, {"mcg_even", 5}, {"mcg_odd", 6}}; - app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map)); + try { app->parse(argc, argv); } catch (const CLI::ParseError &e) { return app->exit(e); } setVerbosity(verbosity); - if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE; if (inv_deflate && inv_multigrid) { printfQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n"); @@ -135,13 +133,9 @@ int main(int argc, char **argv) initRand(); - // Only these fermions are supported in this file. 
Ensure a reasonable default, - // ensure that the default is improved staggered - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) { - printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type), - get_dslash_str(QUDA_ASQTAD_DSLASH)); - dslash_type = QUDA_ASQTAD_DSLASH; - } + // Only these fermions are supported in this file + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); // Need to add support for LAPLACE MG? if (inv_multigrid) { @@ -151,9 +145,6 @@ int main(int argc, char **argv) } } - // Deduce operator, solution, and operator preconditioning types - if (!inv_multigrid) setQudaStaggeredInvTestParams(); - display_test_info(); // Set QUDA internal parameters @@ -310,27 +301,13 @@ int main(int argc, char **argv) std::vector gflops(Nsrc); std::vector iter(Nsrc); - // Pointers for split grid tests - std::vector _h_b(Nsrc, nullptr); - std::vector _h_x(Nsrc, nullptr); + // Populate `in` with random noise + for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); } // QUDA invert test //---------------------------------------------------------------------------- - if (test_type >= 0 && test_type <= 4) { - // case 0: // full parity solution, full parity system - // case 1: // full parity solution, solving EVEN EVEN prec system - // case 2: // full parity solution, solving ODD ODD prec system - // case 3: // even parity solution, solving EVEN system - // case 4: // odd parity solution, solving ODD system - - if (multishift != 1) { - printfQuda("Multishift not supported for test %d\n", test_type); - exit(0); - } - - for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); } - + if (multishift == 1) { if (!use_split_grid) { for (int k = 0; k < Nsrc; k++) { if (inv_deflate) 
eig_param.preserve_deflation = k < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; @@ -363,18 +340,12 @@ int main(int argc, char **argv) for (int k = 0; k < Nsrc; k++) { if (verify_results) - verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0); + verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, inv_param, 0); } - } else if (test_type == 5 || test_type == 6) { - // case 5: // multi mass CG, even parity solution, solving EVEN system - // case 6: // multi mass CG, odd parity solution, solving ODD system - + } else if (multishift > 1) { if (use_split_grid) errorQuda("Multishift currently doesn't support split grid.\n"); - if (multishift < 2) - errorQuda("Multishift inverter requires more than one shift, multishift = %d\n", multishift); - inv_param.num_offset = multishift; // Prepare vectors for masses @@ -418,11 +389,11 @@ int main(int argc, char **argv) for (int i = 0; i < multishift; i++) { printfQuda("%dth solution: mass=%f, ", i, masses[i]); - verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i); + verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], *cpuFat, *cpuLong, inv_param, i); } } } else { - errorQuda("Unsupported test type"); + errorQuda("Invalid number of shifts %d", multishift); } // switch // Compute timings @@ -457,11 +428,6 @@ int main(int argc, char **argv) delete ref; delete tmp; - if (use_split_grid) { - for (auto p : _h_b) { delete p; } - for (auto p : _h_x) { delete p; } - } - // Finalize the QUDA library endQuda(); diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index c56ec2bd14..99e6fb4cbc 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -471,7 +471,7 @@ std::shared_ptr make_app(std::string app_description, std::string app_n quda_app->add_option("--device", device_ordinal, "Set the CUDA device 
to use (default 0, single GPU only)") ->check(CLI::Range(0, 16)); - quda_app->add_option("--dslash-type", dslash_type, "Set the dslash type") + quda_app->add_option("--dslash-type", dslash_type, "Set the dslash type (default wilson or asqtad as appropriate)") ->transform(CLI::QUDACheckedTransformer(dslash_type_map)); quda_app->add_option("--epsilon", epsilon, "Twisted-Mass flavor twist of Dirac operator (default 0.01)"); @@ -499,7 +499,7 @@ std::shared_ptr make_app(std::string app_description, std::string app_n ->transform(CLI::QUDACheckedTransformer(mass_normalization_map)); quda_app - ->add_option("--matpc", matpc_type, "Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym)") + ->add_option("--matpc", matpc_type, "Matrix preconditioning type (even-even (default), odd-odd, even-even-asym, odd-odd-asym)") ->transform(CLI::QUDACheckedTransformer(matpc_type_map)); quda_app->add_option("--msrc", Msrc, "Used for testing non-square block blas routines where nsrc defines the other dimension"); @@ -600,7 +600,7 @@ std::shared_ptr make_app(std::string app_description, std::string app_n quda_app ->add_option( "--solution-type", solution_type, - "The solution we desire (mat (default), mat-dag-mat, mat-pc, mat-pc-dag-mat-pc (default for multi-shift))") + "The solution we desire (mat (default for Wilson-type), mat-dag-mat, mat-pc (default for staggered-type), mat-pc-dag-mat-pc (default for Wilson-type multi-shift))") ->transform(CLI::QUDACheckedTransformer(solution_type_map)); quda_app @@ -610,7 +610,7 @@ std::shared_ptr make_app(std::string app_description, std::string app_n quda_app ->add_option("--solve-type", solve_type, - "The type of solve to do (direct, direct-pc, normop, normop-pc, normerr, normerr-pc)") + "The type of solve to do (direct, direct-pc (default for staggered-type), normop, normop-pc (default for Wilson-type), normerr, normerr-pc)") ->transform(CLI::QUDACheckedTransformer(solve_type_map)); quda_app 
->add_option("--solver-ext-lib-type", solver_ext_lib, "Set external library for the solvers (default Eigen library)") @@ -752,8 +752,8 @@ void add_eigen_option_group(std::shared_ptr quda_app) opgroup->add_option("--eig-use-dagger", eig_use_dagger, "Solve the Mdag problem instead of M (MMdag if eig-use-normop == true) (default false)"); opgroup->add_option("--eig-use-normop", eig_use_normop, - "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false)"); - opgroup->add_option("--eig-use-pc", eig_use_pc, "Solve the Even-Odd preconditioned problem (default false)"); + "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false for Wilson-type, true for staggered-type)"); + opgroup->add_option("--eig-use-pc", eig_use_pc, "Solve the Even-Odd preconditioned problem (default false for Wilson-type, true for staggered-type)"); opgroup->add_option("--eig-use-poly-acc", eig_use_poly_acc, "Use Chebyshev polynomial acceleration in the eigensolver"); } diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 1d2692b25e..23aa99e6df 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -42,8 +42,7 @@ extern QudaPrecision &cuda_prec_ritz; // Set some basic parameters via command line or use defaults // Implemented in set_params.cpp -void setQudaStaggeredEigTestParams(); -void setQudaStaggeredInvTestParams(); +void setQudaStaggeredDefaultInvTestParams(); // Staggered gauge field utils //------------------------------------------------------ diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index c07fdb9e5d..f6e7a6a394 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -96,23 +96,6 @@ const char *get_test_type(int t) return ret; } -const char *get_staggered_test_type(int t) -{ - const char *ret; - switch (t) { - case 0: ret = "full"; break; - case 1: ret = "full_ee_prec"; break; - case 2: ret = "full_oo_prec"; break; - case 3: ret = "even"; break; - case 4: ret = "odd"; break; 
- case 5: ret = "mcg_even"; break; - case 6: ret = "mcg_odd"; break; - default: ret = "unknown"; break; - } - - return ret; -} - const char *get_dslash_str(QudaDslashType type) { const char *ret; diff --git a/tests/utils/misc.h b/tests/utils/misc.h index bac9cf69c9..bf9a8d3039 100644 --- a/tests/utils/misc.h +++ b/tests/utils/misc.h @@ -7,7 +7,6 @@ const char *get_recon_str(QudaReconstructType recon); const char *get_prec_str(QudaPrecision prec); const char *get_gauge_order_str(QudaGaugeFieldOrder order); const char *get_test_type(int t); -const char *get_staggered_test_type(int t); const char *get_unitarization_str(bool svd_only); const char *get_mass_normalization_str(QudaMassNormalization); const char *get_verbosity_str(QudaVerbosity); diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index aa95dbbd34..404401c2d6 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -1379,98 +1379,60 @@ void setDeflationParam(QudaEigParam &df_param) df_param.partfile = eig_partfile ? 
QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; } -void setQudaStaggeredInvTestParams() +/**********/ +// The enumerated staggered tests have been removed, but for reference: +// +// Test 0: +// solve_type = QUDA_DIRECT_SOLVE +// matpc_type = QUDA_MATPC_EVEN_EVEN (doesn't matter) +// solution_type = QUDA_MAT_SOLUTION +// +// Test 1: +// solve_type = QUDA_DIRECT_PC_SOLVE +// matpc_type = QUDA_MATPC_EVEN_EVEN +// solution_type = QUDA_MAT_SOLUTION +// +// Test 2: +// solve_type = QUDA_DIRECT_PC_SOLVE +// matpc_type = QUDA_MATPC_ODD_ODD +// solution_type = QUDA_MAT_SOLUTION +// +// Test 3: +// solve_type = QUDA_DIRECT_PC_SOLVE +// matpc_type = QUDA_MATPC_EVEN_EVEN +// solution_type = QUDA_MATPC_SOLUTION +// +// Test 4: +// solve_type = QUDA_DIRECT_PC_SOLVE +// matpc_type = QUDA_MATPC_ODD_ODD +// solution_type = QUDA_MATPC_SOLUTION +// +// Test 5: multi-shift +// solve_type = QUDA_DIRECT_PC_SOLVE +// matpc_type = QUDA_MATPC_EVEN_EVEN +// solution_type = QUDA_MATPC_SOLUTION +// +// Test 6: multi-shift +// solve_type = QUDA_DIRECT_PC_SOLVE +// matpc_type = QUDA_MATPC_ODD_ODD +// solution_type = QUDA_MATPC_SOLUTION +/**********/ + +void setQudaStaggeredDefaultInvTestParams() { - if (dslash_type == QUDA_LAPLACE_DSLASH) { - if (test_type != 0) { errorQuda("Test type %d is not supported for the Laplace operator.\n", test_type); } + // Set some meaningful defaults for staggered tests - solve_type = QUDA_DIRECT_SOLVE; - solution_type = QUDA_MAT_SOLUTION; - matpc_type = QUDA_MATPC_EVEN_EVEN; // doesn't matter + // Default to the ASQTAD dslash + dslash_type = QUDA_ASQTAD_DSLASH; - } else { - - if (test_type == 0 && (inv_type == QUDA_CG_INVERTER || inv_type == QUDA_PCG_INVERTER) - && solve_type != QUDA_NORMOP_SOLVE && solve_type != QUDA_DIRECT_PC_SOLVE) { - warningQuda("The full spinor staggered operator (test 0) can't be inverted with (P)CG. 
Switching to BiCGstab.\n"); - inv_type = QUDA_BICGSTAB_INVERTER; - } - - if (solve_type == QUDA_INVALID_SOLVE) { - if (test_type == 0) { - solve_type = QUDA_DIRECT_SOLVE; - } else { - solve_type = QUDA_DIRECT_PC_SOLVE; - } - } - - if (test_type == 1 || test_type == 3 || test_type == 5) { - matpc_type = QUDA_MATPC_EVEN_EVEN; - } else if (test_type == 2 || test_type == 4 || test_type == 6) { - matpc_type = QUDA_MATPC_ODD_ODD; - } else if (test_type == 0) { - matpc_type = QUDA_MATPC_EVEN_EVEN; // it doesn't matter - } - - if (test_type == 0 || test_type == 1 || test_type == 2) { - solution_type = QUDA_MAT_SOLUTION; - } else { - solution_type = QUDA_MATPC_SOLUTION; - } - } - - if (prec_sloppy == QUDA_INVALID_PRECISION) { prec_sloppy = prec; } - - if (prec_refinement_sloppy == QUDA_INVALID_PRECISION) { prec_refinement_sloppy = prec_sloppy; } - if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) { link_recon_sloppy = link_recon; } + // Default to a Schur-preconditioned CG solve + solve_type = QUDA_DIRECT_PC_SOLVE; + solution_type = QUDA_MATPC_SOLUTION; + matpc_type = QUDA_MATPC_EVEN_EVEN; + inv_type = QUDA_CG_INVERTER; - if (inv_type != QUDA_CG_INVERTER && (test_type == 5 || test_type == 6)) { - errorQuda("Preconditioning is currently not supported in multi-shift solver solvers"); - } - - // Set n_naiks to 2 if eps_naik != 0.0 - if (eps_naik != 0.0) { - if (compute_fatlong) - n_naiks = 2; - else - eps_naik = 0.0; // to avoid potential headaches - } -} - -void setQudaStaggeredEigTestParams() -{ - if (dslash_type == QUDA_LAPLACE_DSLASH) { - // LAPLACE operator path, only DIRECT solves feasible. 
- if (test_type != 0) { errorQuda("Test type %d is not supported for the Laplace operator.\n", test_type); } - solve_type = QUDA_DIRECT_SOLVE; - solution_type = QUDA_MAT_SOLUTION; - } else { - // STAGGERED operator path - if (solve_type == QUDA_INVALID_SOLVE) { - if (test_type == 0) { - solve_type = QUDA_DIRECT_SOLVE; - } else { - solve_type = QUDA_DIRECT_PC_SOLVE; - } - } - // If test type is not 3, it is 4 or 0. If 0, the matpc type is irrelevant - if (test_type == 3) - matpc_type = QUDA_MATPC_EVEN_EVEN; - else - matpc_type = QUDA_MATPC_ODD_ODD; - - if (test_type == 0) { - solution_type = QUDA_MAT_SOLUTION; - } else { - solution_type = QUDA_MATPC_SOLUTION; - } - } - - // Set n_naiks to 2 if eps_naik != 0.0 - if (eps_naik != 0.0) { - if (compute_fatlong) - n_naiks = 2; - else - eps_naik = 0.0; // to avoid potential headaches - } + // For an eigensolve, default to using the "regular" operator instead of the normal + // operator because the Schur operator is already HPD + eig_use_normop = QUDA_BOOLEAN_FALSE; + eig_use_pc = true; } From b39297b0f59dcb31d6f5caafd6f356f1c3856ef8 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 29 Nov 2023 15:19:51 -0800 Subject: [PATCH 09/53] Misc cleanup to make hisq_stencil_test match some conventions in staggered_dslash_test_utils --- tests/hisq_stencil_test.cpp | 168 ++++++++++------------------ tests/staggered_dslash_test_utils.h | 1 - 2 files changed, 61 insertions(+), 108 deletions(-) diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp index 98c2ae91d3..a9103e4e4b 100644 --- a/tests/hisq_stencil_test.cpp +++ b/tests/hisq_stencil_test.cpp @@ -46,8 +46,7 @@ static double max_allowed_error = 1e-11; static void hisq_test() { - - QudaGaugeParam qudaGaugeParam; + QudaGaugeParam gauge_param; initQuda(device_ordinal); @@ -55,42 +54,20 @@ static void hisq_test() errorQuda("Precision %d is unsupported in some link fattening routines\n", prec); } + if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported 
gauge order %d", gauge_order); + cpu_prec = prec; host_gauge_data_type_size = cpu_prec; - qudaGaugeParam = newQudaGaugeParam(); - - qudaGaugeParam.anisotropy = 1.0; - - // Fix me: must always be set to 1.0 for reasons not yet discerned. - // The tadpole coefficient gets encoded directly into the fat link - // construct coefficents. - qudaGaugeParam.tadpole_coeff = 1.0; - - qudaGaugeParam.X[0] = xdim; - qudaGaugeParam.X[1] = ydim; - qudaGaugeParam.X[2] = zdim; - qudaGaugeParam.X[3] = tdim; - setDims(qudaGaugeParam.X); + gauge_param = newQudaGaugeParam(); - qudaGaugeParam.cpu_prec = cpu_prec; - qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec; + setStaggeredGaugeParam(gauge_param); - if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order); - - qudaGaugeParam.gauge_order = gauge_order; - qudaGaugeParam.type = QUDA_WILSON_LINKS; - qudaGaugeParam.reconstruct = qudaGaugeParam.reconstruct_sloppy = link_recon; - qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T; - qudaGaugeParam.staggered_phase_type = QUDA_STAGGERED_PHASE_MILC; - qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; - qudaGaugeParam.ga_pad = 0; + setDims(gauge_param.X); - // Needed for unitarization, following "unitarize_link_test.cpp" - GaugeFieldParam gParam(qudaGaugeParam); - gParam.link_type = QUDA_GENERAL_LINKS; - gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - gParam.order = gauge_order; + gauge_param.cpu_prec = cpu_prec; + gauge_param.cuda_prec_sloppy = cuda_prec; + gauge_param.reconstruct_sloppy = link_recon; /////////////////////////////////////////////////////////////// // Set up the coefficients for each part of the HISQ stencil // @@ -148,21 +125,14 @@ static void hisq_test() // Input links // ///////////////// - void *sitelink[4]; - for (int i = 0; i < 4; i++) sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size); + void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr}; + for (int i = 0; i < 4; i++) qdp_sitelink[i] 
= pinned_malloc(V * gauge_site_size * host_gauge_data_type_size); - void *milc_sitelink; - milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + void *milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // Note: this could be replaced with loading a gauge field - createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 0); // 0 -> no phases - for (int i = 0; i < V; ++i) { - for (int dir = 0; dir < 4; ++dir) { - char *src = (char *)sitelink[dir]; - memcpy((char *)milc_sitelink + (i * 4 + dir) * gauge_site_size * host_gauge_data_type_size, - src + i * gauge_site_size * host_gauge_data_type_size, gauge_site_size * host_gauge_data_type_size); - } - } + createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases + reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); ////////////////////// // Perform GPU test // @@ -187,7 +157,7 @@ static void hisq_test() // Tuning run... { printfQuda("Tuning...\n"); - computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &qudaGaugeParam); + computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &gauge_param); } struct timeval t0, t1; @@ -198,11 +168,11 @@ static void hisq_test() // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! 
// Create V links (fat7 links) and W links (unitarized V links), 1st path table set - computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &qudaGaugeParam); + computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param); if (n_naiks > 1) { // Create Naiks, 3rd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &qudaGaugeParam); + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param); // Rescale+copy Naiks into Naik field cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); @@ -213,7 +183,7 @@ static void hisq_test() } // Create X and long links, 2nd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &qudaGaugeParam); + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param); if (n_naiks > 1) { // Add into Naik field @@ -229,24 +199,26 @@ static void hisq_test() // Perform CPU Build // /////////////////////// - void *long_reflink[4]; // Long link for fermion with zero epsilon - void *fat_reflink[4]; // Fat link for fermion with zero epsilon + // fat and long links for fermions with zero epsilon + void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; + void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; for (int i = 0; i < 4; i++) { - long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); } - void *long_reflink_eps[4]; // Long link for fermion with non-zero epsilon - void *fat_reflink_eps[4]; // Fat link for fermion with non-zero epsilon + // fat and long links for fermions with non-zero epsilon + void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; if (n_naiks > 1) { for 
(int i = 0; i < 4; i++) { - long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); } } if (verify_results) { - computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, sitelink, &qudaGaugeParam, + computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param, act_paths, eps_naik); } @@ -254,45 +226,25 @@ static void hisq_test() // Layout change for fatlink, fatlink_eps, longlink, longlink_eps // //////////////////////////////////////////////////////////////////// - void *myfatlink[4]; - void *mylonglink[4]; - void *myfatlink_eps[4]; - void *mylonglink_eps[4]; + void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; + void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; + void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; for (int i = 0; i < 4; i++) { - - myfatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - mylonglink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - memset(myfatlink[i], 0, V * gauge_site_size * host_gauge_data_type_size); - memset(mylonglink[i], 0, V * gauge_site_size * host_gauge_data_type_size); - + qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); if (n_naiks > 1) { - myfatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - mylonglink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - memset(myfatlink_eps[i], 0, V * gauge_site_size * host_gauge_data_type_size); - memset(mylonglink_eps[i], 0, V * gauge_site_size * host_gauge_data_type_size); + qdp_fatlink_eps[i] = safe_malloc(V * 
gauge_site_size * host_gauge_data_type_size); + qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); } } - for (int i = 0; i < V; i++) { - for (int dir = 0; dir < 4; dir++) { - char *src = ((char *)fatlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size; - char *dst = ((char *)myfatlink[dir]) + i * gauge_site_size * host_gauge_data_type_size; - memcpy(dst, src, gauge_site_size * host_gauge_data_type_size); - - src = ((char *)longlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size; - dst = ((char *)mylonglink[dir]) + i * gauge_site_size * host_gauge_data_type_size; - memcpy(dst, src, gauge_site_size * host_gauge_data_type_size); - - if (n_naiks > 1) { - src = ((char *)fatlink_eps) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size; - dst = ((char *)myfatlink_eps[dir]) + i * gauge_site_size * host_gauge_data_type_size; - memcpy(dst, src, gauge_site_size * host_gauge_data_type_size); - - src = ((char *)longlink_eps) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size; - dst = ((char *)mylonglink_eps[dir]) + i * gauge_site_size * host_gauge_data_type_size; - memcpy(dst, src, gauge_site_size * host_gauge_data_type_size); - } - } + reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + + if (n_naiks > 1) { + reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); } ////////////////////////////// @@ -303,20 +255,20 @@ static void hisq_test() printfQuda("Checking fat links...\n"); int res = 1; for (int dir = 0; dir < 4; dir++) { - res &= compare_floats(fat_reflink[dir], myfatlink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec); + res &= 
compare_floats(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec); } - strong_check_link(myfatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); + strong_check_link(qdp_fatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec); printfQuda("Fat-link test %s\n\n", (1 == res) ? "PASSED" : "FAILED"); printfQuda("Checking long links...\n"); res = 1; for (int dir = 0; dir < 4; ++dir) { - res &= compare_floats(long_reflink[dir], mylonglink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec); + res &= compare_floats(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec); } - strong_check_link(mylonglink, "GPU results: ", long_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); + strong_check_link(qdp_longlink, "GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec); printfQuda("Long-link test %s\n\n", (1 == res) ? "PASSED" : "FAILED"); @@ -325,31 +277,31 @@ static void hisq_test() printfQuda("Checking fat eps_naik links...\n"); res = 1; for (int dir = 0; dir < 4; dir++) { - res &= compare_floats(fat_reflink_eps[dir], myfatlink_eps[dir], V * gauge_site_size, 1e-3, - qudaGaugeParam.cpu_prec); + res &= compare_floats(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, 1e-3, + gauge_param.cpu_prec); } - strong_check_link(myfatlink_eps, "GPU results: ", fat_reflink_eps, "CPU reference results:", V, - qudaGaugeParam.cpu_prec); + strong_check_link(qdp_fatlink_eps, "GPU results: ", fat_reflink_eps, "CPU reference results:", V, + gauge_param.cpu_prec); printfQuda("Fat-link eps_naik test %s\n\n", (1 == res) ? 
"PASSED" : "FAILED"); printfQuda("Checking long eps_naik links...\n"); res = 1; for (int dir = 0; dir < 4; ++dir) { - res &= compare_floats(long_reflink_eps[dir], mylonglink_eps[dir], V * gauge_site_size, 1e-3, - qudaGaugeParam.cpu_prec); + res &= compare_floats(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, 1e-3, + gauge_param.cpu_prec); } - strong_check_link(mylonglink_eps, "GPU results: ", long_reflink_eps, "CPU reference results:", V, - qudaGaugeParam.cpu_prec); + strong_check_link(qdp_longlink_eps, "GPU results: ", long_reflink_eps, "CPU reference results:", V, + gauge_param.cpu_prec); printfQuda("Long-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED"); } } // FIXME: does not include unitarization, extra naiks - int volume = qudaGaugeParam.X[0] * qudaGaugeParam.X[1] * qudaGaugeParam.X[2] * qudaGaugeParam.X[3]; + int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3]; long long flops = 61632 * (long long)niter; // Constructing V field // Constructing W field? 
// Constructing separate Naiks @@ -360,16 +312,16 @@ static void hisq_test() printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf); for (int i = 0; i < 4; i++) { - host_free(myfatlink[i]); - host_free(mylonglink[i]); + host_free(qdp_fatlink[i]); + host_free(qdp_longlink[i]); if (n_naiks > 1) { - host_free(myfatlink_eps[i]); - host_free(mylonglink_eps[i]); + host_free(qdp_fatlink_eps[i]); + host_free(qdp_longlink_eps[i]); } } for (int i = 0; i < 4; i++) { - host_free(sitelink[i]); + host_free(qdp_sitelink[i]); host_free(fat_reflink[i]); host_free(long_reflink[i]); if (n_naiks > 1) { @@ -433,6 +385,8 @@ int main(int argc, char **argv) if (eps_naik != 0.0) { n_naiks = 2; } + setVerbosity(verbosity); + initComms(argc, argv, gridsize_from_cmdline); display_test_info(); hisq_test(); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 264371f2f0..246dcdfea4 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -221,7 +221,6 @@ struct StaggeredDslashTestWrapper { // set verbosity prior to loadGaugeQuda setVerbosity(verbosity); - } void init() From a1049cf823960d5175f5763d223db5c2023e680f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 29 Nov 2023 21:37:58 -0800 Subject: [PATCH 10/53] Significant refactoring to hist_stencil_test, getting closer to simple test and ctest flavors --- tests/hisq_stencil_test.cpp | 506 ++++++++++++++++++++---------------- 1 file changed, 288 insertions(+), 218 deletions(-) diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp index a9103e4e4b..77f7933a40 100644 --- a/tests/hisq_stencil_test.cpp +++ b/tests/hisq_stencil_test.cpp @@ -1,21 +1,16 @@ #include #include #include -#include -#include "quda.h" -#include "gauge_field.h" -#include "host_utils.h" +#include +#include +#include #include -#include "misc.h" -#include "util_quda.h" -#include "malloc_quda.h" +#include #include -#include 
"ks_improved_force.h" +#include -#ifdef MULTI_GPU -#include "comm_quda.h" -#endif +#include #define TDIFF(a, b) (b.tv_sec - a.tv_sec + 0.000001 * (b.tv_usec - a.tv_usec)) @@ -36,222 +31,322 @@ static double svd_rel_error = 1e-4; static double svd_abs_error = 1e-4; static double max_allowed_error = 1e-11; -/*--------------------------------------------------------------------*/ -// Some notation: -// U -- original link, SU(3), copied to "field" from "site" -// V -- after 1st level of smearing, non-SU(3) -// W -- unitarized, SU(3) -// X -- after 2nd level of smearing, non-SU(3) -/*--------------------------------------------------------------------*/ +struct HisqStencilTestWrapper { -static void hisq_test() -{ - QudaGaugeParam gauge_param; + static inline QudaGaugeParam gauge_param; - initQuda(device_ordinal); + // staple coefficients for different portions of the HISQ stencil build + static inline std::array, 3> act_paths; - if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION) { - errorQuda("Precision %d is unsupported in some link fattening routines\n", prec); - } + // initial links in MILC order + static inline void* milc_sitelink = nullptr; - if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order); + // storage for CPU reference fat and long links w/zero Naik + static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; - cpu_prec = prec; - host_gauge_data_type_size = cpu_prec; + // storage for CPU reference fat and long links w/non-zero Naik + static inline void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - gauge_param = newQudaGaugeParam(); + // Paths for step 1: + static inline void *vlink = nullptr; + static inline void *wlink = nullptr; - setStaggeredGaugeParam(gauge_param); + // Paths for step 2: + static inline void 
*fatlink = nullptr; + static inline void *longlink = nullptr; - setDims(gauge_param.X); + // Place to accumulate Naiks + static inline void *fatlink_eps = nullptr; + static inline void *longlink_eps = nullptr; - gauge_param.cpu_prec = cpu_prec; - gauge_param.cuda_prec_sloppy = cuda_prec; - gauge_param.reconstruct_sloppy = link_recon; + static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - /////////////////////////////////////////////////////////////// - // Set up the coefficients for each part of the HISQ stencil // - /////////////////////////////////////////////////////////////// + void init_test() { + cpu_prec = prec; + host_gauge_data_type_size = cpu_prec; - // Reference: "generic_ks/imp_actions/hisq/hisq_action.h", - // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c + gauge_param = newQudaGaugeParam(); + setStaggeredGaugeParam(gauge_param); - double u1 = 1.0 / tadpole_factor; - double u2 = u1 * u1; - double u4 = u2 * u2; - double u6 = u4 * u2; + static bool first_time = true; + if (first_time) { + init_host(); + first_time = false; + } + } - std::array, 3> act_paths; + void init_host() { + setDims(gauge_param.X); + dw_setDims(gauge_param.X, 1); + + /////////////////////////////////////////////////////////////// + // Set up the coefficients for each part of the HISQ stencil // + /////////////////////////////////////////////////////////////// + + // Reference: "generic_ks/imp_actions/hisq/hisq_action.h", + // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c + + double u1 = 1.0 / tadpole_factor; + double u2 = u1 * u1; + double u4 = u2 * u2; + double u6 = u4 * u2; + + // First path: create V, W links + act_paths[0] = { + (1.0 / 8.0), /* one link */ + u2 * 
(0.0), /* Naik */ + u2 * (-1.0 / 8.0) * 0.5, /* simple staple */ + u4 * (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ + u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ + u4 * (0.0) /* Lepage term */ + }; + + // Second path: create X, long links + act_paths[1] = { + ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */ + /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ + (-1.0 / 24.0), /* Naik */ + (-1.0 / 8.0) * 0.5, /* simple staple */ + (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ + (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ + (-2.0 / 16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */ + }; + + // Paths for epsilon corrections. Not used if n_naiks = 1. + act_paths[2] = { + (1.0 / 8.0), /* one link b/c of Naik */ + (-1.0 / 24.0), /* Naik */ + 0.0, /* simple staple */ + 0.0, /* displace link in two directions */ + 0.0, /* displace link in three directions */ + 0.0 /* Lepage term */ + }; + + //////////////////////////////////// + // Set unitarization coefficients // + //////////////////////////////////// + + setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, + svd_abs_error); + + ///////////////// + // Input links // + ///////////////// + + void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr}; + for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size); + + milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + + // Note: this could be replaced with loading a gauge field + createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases + reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + + /////////////////////// + // Perform CPU Build // + /////////////////////// - // First path: create V, W links - act_paths[0] 
= { - (1.0 / 8.0), /* one link */ - u2 * (0.0), /* Naik */ - u2 * (-1.0 / 8.0) * 0.5, /* simple staple */ - u4 * (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ - u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ - u4 * (0.0) /* Lepage term */ - }; + for (int i = 0; i < 4; i++) { + // fat and long links for fermions with zero epsilon + fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + + // fat and long links for fermions with non-zero epsilon + if (n_naiks > 1) { + fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + } + } - // Second path: create X, long links - act_paths[1] = { - ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */ - /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ - (-1.0 / 24.0), /* Naik */ - (-1.0 / 8.0) * 0.5, /* simple staple */ - (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ - (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ - (-2.0 / 16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */ - }; + computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param, + act_paths, eps_naik); - // Paths for epsilon corrections. Not used if n_naiks = 1. 
- act_paths[2] = { - (1.0 / 8.0), /* one link b/c of Naik */ - (-1.0 / 24.0), /* Naik */ - 0.0, /* simple staple */ - 0.0, /* displace link in two directions */ - 0.0, /* displace link in three directions */ - 0.0 /* Lepage term */ - }; + /////////////////////////////////////////////////////// + // Allocate host storage for fields built on the GPU // + /////////////////////////////////////////////////////// - //////////////////////////////////// - // Set unitarization coefficients // - //////////////////////////////////// + // Paths for step 1: + vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links + wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links - setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, - svd_abs_error); + // Paths for step 2: + fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final fat ("X") links + longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links - ///////////////// - // Input links // - ///////////////// + // Place to accumulate Naiks + if (n_naiks > 1) { + fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon fat links + longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks + } - void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr}; - for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size); + // QDP order fields + for (int i = 0; i < 4; i++) { + qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + if (n_naiks > 1) { + qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * 
host_gauge_data_type_size); + } + } - void *milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + ///////////////////////////////////////////////////////// + // Free allocations that are only needed for CPU setup // + ///////////////////////////////////////////////////////// - // Note: this could be replaced with loading a gauge field - createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases - reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + for (int i = 0; i < 4; i++) + host_free(qdp_sitelink[i]); + } - ////////////////////// - // Perform GPU test // - ////////////////////// + static void destroy() { + if (milc_sitelink) host_free(milc_sitelink); - // Paths for step 1: - void *vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links - void *wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links + for (int i = 0; i < 4; i++) { + host_free(fat_reflink[i]); + host_free(long_reflink[i]); + if (n_naiks > 1) { + host_free(fat_reflink_eps[i]); + host_free(long_reflink_eps[i]); + } + } - // Paths for step 2: - void *fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final fat ("X") links - void *longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links + // Clean up GPU compute links + host_free(vlink); + host_free(wlink); + host_free(fatlink); + host_free(longlink); - // Place to accumulate Naiks - void *fatlink_eps = nullptr; - void *longlink_eps = nullptr; - if (n_naiks > 1) { - fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon fat links - longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks - } + if (n_naiks > 1) { + host_free(fatlink_eps); + host_free(longlink_eps); + } - // Tuning run... 
- { - printfQuda("Tuning...\n"); - computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &gauge_param); + for (int i = 0; i < 4; i++) { + host_free(qdp_fatlink[i]); + host_free(qdp_longlink[i]); + if (n_naiks > 1) { + host_free(qdp_fatlink_eps[i]); + host_free(qdp_longlink_eps[i]); + } + } + +#ifdef MULTI_GPU + exchange_llfat_cleanup(); +#endif } - struct timeval t0, t1; - printfQuda("Running %d iterations of computation\n", niter); - gettimeofday(&t0, NULL); - for (int n = 0; n < niter; n++) { + /*--------------------------------------------------------------------*/ + // Some notation: + // U -- original link, SU(3), copied to "field" from "site" + // V -- after 1st level of smearing, non-SU(3) + // W -- unitarized, SU(3) + // X -- after 2nd level of smearing, non-SU(3) + /*--------------------------------------------------------------------*/ - // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! + double llfatCUDA(int niter) { + host_timer_t host_timer; - // Create V links (fat7 links) and W links (unitarized V links), 1st path table set - computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param); + comm_barrier(); + host_timer.start(); - if (n_naiks > 1) { - // Create Naiks, 3rd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param); - - // Rescale+copy Naiks into Naik field - cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); - cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size); - } else { - memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); - memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); - } + for (int i = 0; i < niter; i++) { + // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! 
- // Create X and long links, 2nd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param); + // Create V links (fat7 links) and W links (unitarized V links), 1st path table set + computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param); - if (n_naiks > 1) { - // Add into Naik field - cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size); - cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size); + if (n_naiks > 1) { + // Create Naiks, 3rd path table set + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param); + + // Rescale+copy Naiks into Naik field + cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); + cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size); + } else { + memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); + memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); + } + + // Create X and long links, 2nd path table set + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param); + + if (n_naiks > 1) { + // Add into Naik field + cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size); + cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size); + } } + + host_timer.stop(); + + return host_timer.last(); } - gettimeofday(&t1, NULL); - double secs = TDIFF(t0, t1); + void run_test(int niter, bool print_metrics = false) { + ////////////////////// + // Perform GPU test // + ////////////////////// - /////////////////////// - // Perform CPU Build // - /////////////////////// + printfQuda("Tuning...\n"); + llfatCUDA(1); - // fat and long links for fermions with zero epsilon - void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; - for (int i = 0; i < 4; i++) { - fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - 
long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - } + auto flops0 = quda::Tunable::flops_global(); + auto bytes0 = quda::Tunable::bytes_global(); - // fat and long links for fermions with non-zero epsilon - void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - if (n_naiks > 1) { - for (int i = 0; i < 4; i++) { - fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - } - } + printfQuda("Running %d iterations of computation\n", niter); + double secs = llfatCUDA(niter); - if (verify_results) { - computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param, - act_paths, eps_naik); - } + unsigned long long flops = (quda::Tunable::flops_global() - flops0); + unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0); - //////////////////////////////////////////////////////////////////// - // Layout change for fatlink, fatlink_eps, longlink, longlink_eps // - //////////////////////////////////////////////////////////////////// - - void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - for (int i = 0; i < 4; i++) { - qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - if (n_naiks > 1) { - qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + if (print_metrics) { + // FIXME: does not include unitarization, extra naiks + int volume = gauge_param.X[0] * 
gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3]; + //long long flops = 61632 * (long long)niter; // Constructing V field + // Constructing W field? + // Constructing separate Naiks + //flops += 61632 * (long long)niter; // Constructing X field + //flops += (252 * 4) * (long long)niter; // long-link contribution + + printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter); + + printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter, + (flops / niter) / volume, (bytes / niter) / volume); + + double gflops = 1.0e-9 * flops / secs; + printfQuda("GFLOPS = %f\n", gflops); + + double gbytes = 1.0e-9 * bytes / secs; + printfQuda("GBYTES = %f\n", gbytes); + + // Old metric + //double perf = flops / (secs * 1024 * 1024 * 1024); + //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf); } } - reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + void verify() + { + //////////////////////////////////////////////////////////////////// + // Layout change for fatlink, fatlink_eps, longlink, longlink_eps // + //////////////////////////////////////////////////////////////////// - if (n_naiks > 1) { - reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - } + reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + + if (n_naiks > 1) { + reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, 
gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + } - ////////////////////////////// - // Perform the verification // - ////////////////////////////// + ////////////////////////////// + // Perform the verification // + ////////////////////////////// - if (verify_results) { printfQuda("Checking fat links...\n"); int res = 1; for (int dir = 0; dir < 4; dir++) { @@ -299,52 +394,22 @@ static void hisq_test() printfQuda("Long-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED"); } } +}; - // FIXME: does not include unitarization, extra naiks - int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3]; - long long flops = 61632 * (long long)niter; // Constructing V field - // Constructing W field? - // Constructing separate Naiks - flops += 61632 * (long long)niter; // Constructing X field - flops += (252 * 4) * (long long)niter; // long-link contribution +static void hisq_test() +{ + initQuda(device_ordinal); - double perf = flops * volume / (secs * 1024 * 1024 * 1024); - printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf); + HisqStencilTestWrapper test_wrapper; - for (int i = 0; i < 4; i++) { - host_free(qdp_fatlink[i]); - host_free(qdp_longlink[i]); - if (n_naiks > 1) { - host_free(qdp_fatlink_eps[i]); - host_free(qdp_longlink_eps[i]); - } - } + test_wrapper.init_test(); - for (int i = 0; i < 4; i++) { - host_free(qdp_sitelink[i]); - host_free(fat_reflink[i]); - host_free(long_reflink[i]); - if (n_naiks > 1) { - host_free(fat_reflink_eps[i]); - host_free(long_reflink_eps[i]); - } - } + test_wrapper.run_test(niter, true); - // Clean up GPU compute links - host_free(vlink); - host_free(wlink); - host_free(fatlink); - host_free(longlink); + test_wrapper.verify(); - if (n_naiks > 1) { - host_free(fatlink_eps); - host_free(longlink_eps); - } + test_wrapper.destroy(); - if (milc_sitelink) host_free(milc_sitelink); -#ifdef MULTI_GPU - exchange_llfat_cleanup(); -#endif endQuda(); } 
@@ -383,6 +448,11 @@ int main(int argc, char **argv) return app->exit(e); } + if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION) + errorQuda("Precision %d is unsupported in some link fattening routines\n", prec); + + if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order); + if (eps_naik != 0.0) { n_naiks = 2; } setVerbosity(verbosity); From 14f1407542f49c152a2e51c8dc542201a508c4a7 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 29 Nov 2023 22:37:25 -0800 Subject: [PATCH 11/53] hisq_stencil_test now runs via gtest, creating a ctest is outstanding --- tests/hisq_stencil_test.cpp | 469 ++++---------------------------- tests/hisq_stencil_test_utils.h | 402 +++++++++++++++++++++++++++ 2 files changed, 458 insertions(+), 413 deletions(-) create mode 100644 tests/hisq_stencil_test_utils.h diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp index 77f7933a40..138bba9eea 100644 --- a/tests/hisq_stencil_test.cpp +++ b/tests/hisq_stencil_test.cpp @@ -1,435 +1,75 @@ -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define TDIFF(a, b) (b.tv_sec - a.tv_sec + 0.000001 * (b.tv_usec - a.tv_usec)) +#include "hisq_stencil_test_utils.h" using namespace quda; -// Number of naiks. If eps_naik is 0.0, we only need -// to construct one naik. -static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER; - -// The file "generic_ks/fermion_links_hisq_load_milc.c" -// within MILC is the ultimate reference for what's going on here. 
- -// Unitarization coefficients -static double unitarize_eps = 1e-6; -static bool reunit_allow_svd = true; -static bool reunit_svd_only = false; -static double svd_rel_error = 1e-4; -static double svd_abs_error = 1e-4; -static double max_allowed_error = 1e-11; - -struct HisqStencilTestWrapper { - - static inline QudaGaugeParam gauge_param; - - // staple coefficients for different portions of the HISQ stencil build - static inline std::array, 3> act_paths; - - // initial links in MILC order - static inline void* milc_sitelink = nullptr; - - // storage for CPU reference fat and long links w/zero Naik - static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; - static inline void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; - - // storage for CPU reference fat and long links w/non-zero Naik - static inline void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - static inline void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - - // Paths for step 1: - static inline void *vlink = nullptr; - static inline void *wlink = nullptr; - - // Paths for step 2: - static inline void *fatlink = nullptr; - static inline void *longlink = nullptr; - - // Place to accumulate Naiks - static inline void *fatlink_eps = nullptr; - static inline void *longlink_eps = nullptr; - - static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; - static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; - static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - - void init_test() { - cpu_prec = prec; - host_gauge_data_type_size = cpu_prec; - - gauge_param = newQudaGaugeParam(); - setStaggeredGaugeParam(gauge_param); - - static bool first_time = true; - if (first_time) { - init_host(); - first_time = false; - } +class HisqStencilTest : public ::testing::Test +{ +protected: + HisqStencilTestWrapper 
hisq_stencil_test_wrapper; + + void display_test_info() { + printfQuda("running the following test:\n"); + printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n"); + printfQuda("%s %s %d/%d/%d/ %d %s \n", + get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order)); + printfQuda("Grid partition info: X Y Z T\n"); + printfQuda(" %d %d %d %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2), + dimPartitioned(3)); + printfQuda("Number of Naiks: %d\n", n_naiks); } - void init_host() { - setDims(gauge_param.X); - dw_setDims(gauge_param.X, 1); - - /////////////////////////////////////////////////////////////// - // Set up the coefficients for each part of the HISQ stencil // - /////////////////////////////////////////////////////////////// - - // Reference: "generic_ks/imp_actions/hisq/hisq_action.h", - // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c - - double u1 = 1.0 / tadpole_factor; - double u2 = u1 * u1; - double u4 = u2 * u2; - double u6 = u4 * u2; - - // First path: create V, W links - act_paths[0] = { - (1.0 / 8.0), /* one link */ - u2 * (0.0), /* Naik */ - u2 * (-1.0 / 8.0) * 0.5, /* simple staple */ - u4 * (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ - u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ - u4 * (0.0) /* Lepage term */ - }; - - // Second path: create X, long links - act_paths[1] = { - ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */ - /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ - (-1.0 / 24.0), /* Naik */ - (-1.0 / 8.0) * 0.5, /* simple staple */ - (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ - (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ - (-2.0 / 16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */ - }; - - // Paths for epsilon corrections. Not used if n_naiks = 1. 
- act_paths[2] = { - (1.0 / 8.0), /* one link b/c of Naik */ - (-1.0 / 24.0), /* Naik */ - 0.0, /* simple staple */ - 0.0, /* displace link in two directions */ - 0.0, /* displace link in three directions */ - 0.0 /* Lepage term */ - }; - - //////////////////////////////////// - // Set unitarization coefficients // - //////////////////////////////////// - - setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, - svd_abs_error); - - ///////////////// - // Input links // - ///////////////// - - void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr}; - for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size); - - milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - - // Note: this could be replaced with loading a gauge field - createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases - reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - - /////////////////////// - // Perform CPU Build // - /////////////////////// - - for (int i = 0; i < 4; i++) { - // fat and long links for fermions with zero epsilon - fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - - // fat and long links for fermions with non-zero epsilon - if (n_naiks > 1) { - fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - } - } - - computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param, - act_paths, eps_naik); - - /////////////////////////////////////////////////////// - // Allocate host storage for fields built on the GPU // - /////////////////////////////////////////////////////// - - // Paths 
for step 1: - vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links - wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links - - // Paths for step 2: - fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final fat ("X") links - longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links - - // Place to accumulate Naiks - if (n_naiks > 1) { - fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon fat links - longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks - } - - // QDP order fields - for (int i = 0; i < 4; i++) { - qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - if (n_naiks > 1) { - qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - } - } - - ///////////////////////////////////////////////////////// - // Free allocations that are only needed for CPU setup // - ///////////////////////////////////////////////////////// - - for (int i = 0; i < 4; i++) - host_free(qdp_sitelink[i]); +public: + virtual void SetUp() { + hisq_stencil_test_wrapper.init_test(); + display_test_info(); } - static void destroy() { - if (milc_sitelink) host_free(milc_sitelink); - - for (int i = 0; i < 4; i++) { - host_free(fat_reflink[i]); - host_free(long_reflink[i]); - if (n_naiks > 1) { - host_free(fat_reflink_eps[i]); - host_free(long_reflink_eps[i]); - } - } - - // Clean up GPU compute links - host_free(vlink); - host_free(wlink); - host_free(fatlink); - host_free(longlink); - - if (n_naiks > 1) { - host_free(fatlink_eps); - host_free(longlink_eps); - } - - for (int i = 0; i < 4; i++) { - host_free(qdp_fatlink[i]); - 
host_free(qdp_longlink[i]); - if (n_naiks > 1) { - host_free(qdp_fatlink_eps[i]); - host_free(qdp_longlink_eps[i]); - } - } - -#ifdef MULTI_GPU - exchange_llfat_cleanup(); -#endif + virtual void TearDown() { + hisq_stencil_test_wrapper.end(); } - /*--------------------------------------------------------------------*/ - // Some notation: - // U -- original link, SU(3), copied to "field" from "site" - // V -- after 1st level of smearing, non-SU(3) - // W -- unitarized, SU(3) - // X -- after 2nd level of smearing, non-SU(3) - /*--------------------------------------------------------------------*/ - - double llfatCUDA(int niter) { - host_timer_t host_timer; - - comm_barrier(); - host_timer.start(); - - for (int i = 0; i < niter; i++) { - // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! - - // Create V links (fat7 links) and W links (unitarized V links), 1st path table set - computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param); - - if (n_naiks > 1) { - // Create Naiks, 3rd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param); - - // Rescale+copy Naiks into Naik field - cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); - cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size); - } else { - memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); - memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); - } - - // Create X and long links, 2nd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param); - - if (n_naiks > 1) { - // Add into Naik field - cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size); - cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size); - } - } - - host_timer.stop(); - - return host_timer.last(); + static void SetUpTestCase() { + initQuda(device_ordinal); } - void run_test(int niter, bool 
print_metrics = false) { - ////////////////////// - // Perform GPU test // - ////////////////////// - - printfQuda("Tuning...\n"); - llfatCUDA(1); - - auto flops0 = quda::Tunable::flops_global(); - auto bytes0 = quda::Tunable::bytes_global(); - - printfQuda("Running %d iterations of computation\n", niter); - double secs = llfatCUDA(niter); - - unsigned long long flops = (quda::Tunable::flops_global() - flops0); - unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0); - - if (print_metrics) { - // FIXME: does not include unitarization, extra naiks - int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3]; - //long long flops = 61632 * (long long)niter; // Constructing V field - // Constructing W field? - // Constructing separate Naiks - //flops += 61632 * (long long)niter; // Constructing X field - //flops += (252 * 4) * (long long)niter; // long-link contribution - - printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter); - - printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter, - (flops / niter) / volume, (bytes / niter) / volume); - - double gflops = 1.0e-9 * flops / secs; - printfQuda("GFLOPS = %f\n", gflops); - - double gbytes = 1.0e-9 * bytes / secs; - printfQuda("GBYTES = %f\n", gbytes); - - // Old metric - //double perf = flops / (secs * 1024 * 1024 * 1024); - //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf); - } - } - - void verify() + // Per-test-case tear-down. + // Called after the last test in this test case. + // Can be omitted if not needed. 
+ static void TearDownTestCase() { - //////////////////////////////////////////////////////////////////// - // Layout change for fatlink, fatlink_eps, longlink, longlink_eps // - //////////////////////////////////////////////////////////////////// - - reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - - if (n_naiks > 1) { - reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - } - - ////////////////////////////// - // Perform the verification // - ////////////////////////////// - - printfQuda("Checking fat links...\n"); - int res = 1; - for (int dir = 0; dir < 4; dir++) { - res &= compare_floats(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec); - } - - strong_check_link(qdp_fatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec); - - printfQuda("Fat-link test %s\n\n", (1 == res) ? "PASSED" : "FAILED"); - - printfQuda("Checking long links...\n"); - res = 1; - for (int dir = 0; dir < 4; ++dir) { - res &= compare_floats(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, 1e-3, gauge_param.cpu_prec); - } - - strong_check_link(qdp_longlink, "GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec); - - printfQuda("Long-link test %s\n\n", (1 == res) ? 
"PASSED" : "FAILED"); - - if (n_naiks > 1) { - - printfQuda("Checking fat eps_naik links...\n"); - res = 1; - for (int dir = 0; dir < 4; dir++) { - res &= compare_floats(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, 1e-3, - gauge_param.cpu_prec); - } - - strong_check_link(qdp_fatlink_eps, "GPU results: ", fat_reflink_eps, "CPU reference results:", V, - gauge_param.cpu_prec); - - printfQuda("Fat-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED"); - - printfQuda("Checking long eps_naik links...\n"); - res = 1; - for (int dir = 0; dir < 4; ++dir) { - res &= compare_floats(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, 1e-3, - gauge_param.cpu_prec); - } - - strong_check_link(qdp_longlink_eps, "GPU results: ", long_reflink_eps, "CPU reference results:", V, - gauge_param.cpu_prec); - - printfQuda("Long-link eps_naik test %s\n\n", (1 == res) ? "PASSED" : "FAILED"); - } + HisqStencilTestWrapper::destroy(); + endQuda(); } }; -static void hisq_test() +TEST_F(HisqStencilTest, benchmark) { - initQuda(device_ordinal); - - HisqStencilTestWrapper test_wrapper; - - test_wrapper.init_test(); - - test_wrapper.run_test(niter, true); - - test_wrapper.verify(); - - test_wrapper.destroy(); - - endQuda(); + hisq_stencil_test_wrapper.run_test(niter, /**show_metrics =*/true); } -static void display_test_info() +TEST_F(HisqStencilTest, verify) { - printfQuda("running the following test:\n"); + if (!verify_results) GTEST_SKIP(); + + hisq_stencil_test_wrapper.run_test(2); + + std::array res = hisq_stencil_test_wrapper.verify(); - printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n"); - printfQuda("%s %s %d/%d/%d/ %d %s \n", - get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order)); + // extra factor of 10 b/c the norm isn't normalized + double max_dev = 10. 
* getTolerance(prec); + // fat link + EXPECT_LE(res[0], max_dev); - printfQuda("Number of Naiks: %d\n", n_naiks); + // long link + EXPECT_LE(res[1], max_dev); } int main(int argc, char **argv) { + // initialize google test + ::testing::InitGoogleTest(&argc, argv); + // for speed xdim = ydim = zdim = tdim = 8; @@ -437,11 +77,8 @@ int main(int argc, char **argv) link_recon = QUDA_RECONSTRUCT_NO; cpu_prec = prec = QUDA_DOUBLE_PRECISION; + // Parse command line options auto app = make_app(); - // app->get_formatter()->column_width(40); - // add_eigen_option_group(app); - // add_deflation_option_group(app); - // add_multigrid_option_group(app); try { app->parse(argc, argv); } catch (const CLI::ParseError &e) { @@ -456,9 +93,15 @@ int main(int argc, char **argv) if (eps_naik != 0.0) { n_naiks = 2; } setVerbosity(verbosity); - initComms(argc, argv, gridsize_from_cmdline); - display_test_info(); - hisq_test(); + + // Ensure gtest prints only from rank 0 + ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); + if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } + + int test_rc = RUN_ALL_TESTS(); + finalizeComms(); + + return test_rc; } diff --git a/tests/hisq_stencil_test_utils.h b/tests/hisq_stencil_test_utils.h new file mode 100644 index 0000000000..b4f7512c12 --- /dev/null +++ b/tests/hisq_stencil_test_utils.h @@ -0,0 +1,402 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace quda; + +// Number of naiks. If eps_naik is 0.0, we only need +// to construct one naik.
+static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER; + +// The file "generic_ks/fermion_links_hisq_load_milc.c" +// within MILC is the ultimate reference for what's going on here. + +// Unitarization coefficients +static double unitarize_eps = 1e-6; +static bool reunit_allow_svd = true; +static bool reunit_svd_only = false; +static double svd_rel_error = 1e-4; +static double svd_abs_error = 1e-4; +static double max_allowed_error = 1e-11; + +struct HisqStencilTestWrapper { + + static inline QudaGaugeParam gauge_param; + + // staple coefficients for different portions of the HISQ stencil build + static inline std::array, 3> act_paths; + + // initial links in MILC order + static inline void* milc_sitelink = nullptr; + + // storage for CPU reference fat and long links w/zero Naik + static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *long_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; + + // storage for CPU reference fat and long links w/non-zero Naik + static inline void *fat_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *long_reflink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + + // Paths for step 1: + static inline void *vlink = nullptr; + static inline void *wlink = nullptr; + + // Paths for step 2: + static inline void *fatlink = nullptr; + static inline void *longlink = nullptr; + + // Place to accumulate Naiks + static inline void *fatlink_eps = nullptr; + static inline void *longlink_eps = nullptr; + + static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; + + void init_test() { + cpu_prec = prec; + host_gauge_data_type_size = cpu_prec; + + gauge_param = newQudaGaugeParam(); + setStaggeredGaugeParam(gauge_param); 
+ + static bool first_time = true; + if (first_time) { + init_host(); + first_time = false; + } + } + + void init_host() { + setDims(gauge_param.X); + dw_setDims(gauge_param.X, 1); + + /////////////////////////////////////////////////////////////// + // Set up the coefficients for each part of the HISQ stencil // + /////////////////////////////////////////////////////////////// + + // Reference: "generic_ks/imp_actions/hisq/hisq_action.h", + // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c + + double u1 = 1.0 / tadpole_factor; + double u2 = u1 * u1; + double u4 = u2 * u2; + double u6 = u4 * u2; + + // First path: create V, W links + act_paths[0] = { + (1.0 / 8.0), /* one link */ + u2 * (0.0), /* Naik */ + u2 * (-1.0 / 8.0) * 0.5, /* simple staple */ + u4 * (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ + u6 * (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ + u4 * (0.0) /* Lepage term */ + }; + + // Second path: create X, long links + act_paths[1] = { + ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */ + /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ + (-1.0 / 24.0), /* Naik */ + (-1.0 / 8.0) * 0.5, /* simple staple */ + (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ + (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ + (-2.0 / 16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */ + }; + + // Paths for epsilon corrections. Not used if n_naiks = 1. 
+ act_paths[2] = { + (1.0 / 8.0), /* one link b/c of Naik */ + (-1.0 / 24.0), /* Naik */ + 0.0, /* simple staple */ + 0.0, /* displace link in two directions */ + 0.0, /* displace link in three directions */ + 0.0 /* Lepage term */ + }; + + //////////////////////////////////// + // Set unitarization coefficients // + //////////////////////////////////// + + setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, + svd_abs_error); + + ///////////////// + // Input links // + ///////////////// + + void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr}; + for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size); + + milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + + // Note: this could be replaced with loading a gauge field + createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases + reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + + /////////////////////// + // Perform CPU Build // + /////////////////////// + + for (int i = 0; i < 4; i++) { + // fat and long links for fermions with zero epsilon + fat_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + long_reflink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + + // fat and long links for fermions with non-zero epsilon + if (n_naiks > 1) { + fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + } + } + + computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param, + act_paths, eps_naik); + + /////////////////////////////////////////////////////// + // Allocate host storage for fields built on the GPU // + /////////////////////////////////////////////////////// + + // Paths 
for step 1: + vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links + wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links + + // Paths for step 2: + fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final fat ("X") links + longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links + + // Place to accumulate Naiks + if (n_naiks > 1) { + fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon fat links + longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks + } + + // QDP order fields + for (int i = 0; i < 4; i++) { + qdp_fatlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + if (n_naiks > 1) { + qdp_fatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + } + } + + ///////////////////////////////////////////////////////// + // Free allocations that are only needed for CPU setup // + ///////////////////////////////////////////////////////// + + for (int i = 0; i < 4; i++) + host_free(qdp_sitelink[i]); + +#ifdef MULTI_GPU + exchange_llfat_cleanup(); +#endif + } + + static void end() { + freeGaugeQuda(); + } + + static void destroy() { + if (milc_sitelink) host_free(milc_sitelink); + + for (int i = 0; i < 4; i++) { + host_free(fat_reflink[i]); + host_free(long_reflink[i]); + if (n_naiks > 1) { + host_free(fat_reflink_eps[i]); + host_free(long_reflink_eps[i]); + } + } + + // Clean up GPU compute links + host_free(vlink); + host_free(wlink); + host_free(fatlink); + host_free(longlink); + + if (n_naiks > 1) { + host_free(fatlink_eps); + host_free(longlink_eps); + } + + for (int i = 0; i < 4; i++) { + host_free(qdp_fatlink[i]); + 
host_free(qdp_longlink[i]); + if (n_naiks > 1) { + host_free(qdp_fatlink_eps[i]); + host_free(qdp_longlink_eps[i]); + } + } + } + + /*--------------------------------------------------------------------*/ + // Some notation: + // U -- original link, SU(3), copied to "field" from "site" + // V -- after 1st level of smearing, non-SU(3) + // W -- unitarized, SU(3) + // X -- after 2nd level of smearing, non-SU(3) + /*--------------------------------------------------------------------*/ + + double llfatCUDA(int niter) { + host_timer_t host_timer; + + comm_barrier(); + host_timer.start(); + + for (int i = 0; i < niter; i++) { + // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! + + // Create V links (fat7 links) and W links (unitarized V links), 1st path table set + computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &gauge_param); + + if (n_naiks > 1) { + // Create Naiks, 3rd path table set + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param); + + // Rescale+copy Naiks into Naik field + cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); + cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size); + } else { + memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); + memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); + } + + // Create X and long links, 2nd path table set + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &gauge_param); + + if (n_naiks > 1) { + // Add into Naik field + cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size); + cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size); + } + } + + host_timer.stop(); + + return host_timer.last(); + } + + void run_test(int niter, bool print_metrics = false) { + ////////////////////// + // Perform GPU test // + ////////////////////// + + printfQuda("Tuning...\n"); + llfatCUDA(1); + + auto flops0 = 
quda::Tunable::flops_global(); + auto bytes0 = quda::Tunable::bytes_global(); + + printfQuda("Running %d iterations of computation\n", niter); + double secs = llfatCUDA(niter); + + unsigned long long flops = (quda::Tunable::flops_global() - flops0); + unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0); + + if (print_metrics) { + // FIXME: does not include unitarization, extra naiks + int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3]; + //long long flops = 61632 * (long long)niter; // Constructing V field + // Constructing W field? + // Constructing separate Naiks + //flops += 61632 * (long long)niter; // Constructing X field + //flops += (252 * 4) * (long long)niter; // long-link contribution + + printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter); + + printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter, + (flops / niter) / volume, (bytes / niter) / volume); + + double gflops = 1.0e-9 * flops / secs; + printfQuda("GFLOPS = %f\n", gflops); + + double gbytes = 1.0e-9 * bytes / secs; + printfQuda("GBYTES = %f\n", gbytes); + + // Old metric + //double perf = flops / (secs * 1024 * 1024 * 1024); + //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf); + } + } + + std::array verify() + { + //////////////////////////////////////////////////////////////////// + // Layout change for fatlink, fatlink_eps, longlink, longlink_eps // + //////////////////////////////////////////////////////////////////// + + reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + + if (n_naiks > 1) { + reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, 
gauge_param.cpu_prec, gauge_param.cpu_prec); + } + + ////////////////////////////// + // Perform the verification // + ////////////////////////////// + + std::array res = {0., 0.}; + + // extra factor of 10 b/c the norm isn't normalized + double max_dev = 10. * getTolerance(prec); + + // Non-zero epsilon check + if (n_naiks > 1) { + for (int dir = 0; dir < 4; dir++) { + res[0] = std::max(res[0], + compare_floats_v2(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, max_dev, + gauge_param.cpu_prec)); + } + + strong_check_link(qdp_fatlink_eps, "Fat link GPU results: ", fat_reflink_eps, "CPU reference results:", V, + gauge_param.cpu_prec); + + for (int dir = 0; dir < 4; ++dir) { + res[1] = std::max(res[1], + compare_floats_v2(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, max_dev, + gauge_param.cpu_prec)); + } + + strong_check_link(qdp_longlink_eps, "Long link GPU results: ", long_reflink_eps, "CPU reference results:", V, + gauge_param.cpu_prec); + } else { + for (int dir = 0; dir < 4; dir++) { + res[0] = std::max(res[0], + compare_floats_v2(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec)); + } + + strong_check_link(qdp_fatlink, "Fat link GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec); + + for (int dir = 0; dir < 4; ++dir) { + res[1] = std::max(res[1], + compare_floats_v2(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec)); + } + + strong_check_link(qdp_longlink, "Long link GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec); + } + + printfQuda("Fat link test %s\n", (res[0] < max_dev) ? "PASSED" : "FAILED"); + printfQuda("Long link test %s\n", (res[1] < max_dev) ? 
"PASSED" : "FAILED"); + + return res; + + } +}; From eff6773549dbe8f96929edd4e2426883eb7cc54f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 30 Nov 2023 12:09:34 -0800 Subject: [PATCH 12/53] Created a working hisq_stencil_ctest, woohoo! --- tests/CMakeLists.txt | 14 ++- tests/hisq_stencil_ctest.cpp | 179 ++++++++++++++++++++++++++++++++ tests/hisq_stencil_test.cpp | 7 +- tests/hisq_stencil_test_utils.h | 151 +++++++++++++++++---------- 4 files changed, 295 insertions(+), 56 deletions(-) create mode 100644 tests/hisq_stencil_ctest.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3a95e355e8..135bbd90be 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -230,6 +230,11 @@ if(QUDA_DIRAC_STAGGERED) quda_checkbuildtest(hisq_stencil_test QUDA_BUILD_ALL_TESTS) install(TARGETS hisq_stencil_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) + add_executable(hisq_stencil_ctest hisq_stencil_ctest.cpp) + target_link_libraries(hisq_stencil_ctest ${TEST_LIBS}) + quda_checkbuildtest(hisq_stencil_ctest QUDA_BUILD_ALL_TESTS) + install(TARGETS hisq_stencil_ctest ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) + add_executable(hisq_paths_force_test hisq_paths_force_test.cpp) target_link_libraries(hisq_paths_force_test ${TEST_LIBS}) quda_checkbuildtest(hisq_paths_force_test QUDA_BUILD_ALL_TESTS) @@ -1289,7 +1294,14 @@ foreach(prec IN LISTS TEST_PRECS) --gtest_output=xml:eigensolve_test_mobius_eofa_asym_${prec}.xml) endif() endforeach(prec) - + +if(QUDA_DIRAC_STAGGERED) + add_test(NAME hisq_stencil + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dim 8 8 8 8 + --gtest_output=xml:hisq_stencil_test.xml) +endif() + foreach(prec IN LISTS TEST_PRECS) add_test(NAME gauge_path_${prec} diff --git a/tests/hisq_stencil_ctest.cpp b/tests/hisq_stencil_ctest.cpp new file mode 100644 index 0000000000..55186c6a5f --- /dev/null +++ b/tests/hisq_stencil_ctest.cpp @@ -0,0 +1,179 @@ +#include "hisq_stencil_test_utils.h" 
+ +using namespace quda; + +bool ctest_all_partitions = false; + +using ::testing::Bool; +using ::testing::Combine; +using ::testing::Range; +using ::testing::TestWithParam; +using ::testing::Values; + +class HisqStencilTest : public ::testing::TestWithParam<::testing::tuple> +{ +protected: + ::testing::tuple param; + + HisqStencilTestWrapper hisq_stencil_test_wrapper; + + bool skip() + { + QudaPrecision precision = static_cast(::testing::get<0>(GetParam())); + QudaReconstructType recon = static_cast(::testing::get<1>(GetParam())); + + if ((QUDA_PRECISION & precision) == 0 + || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) + return true; + + const std::array partition_enabled {true, true, true, false, true, false, false, false, + true, false, false, false, true, false, true, true}; + if (!ctest_all_partitions && !partition_enabled[::testing::get<3>(GetParam())]) return true; + + return false; + } + + void display_test_info(QudaPrecision prec, QudaReconstructType link_recon, bool has_naik) { + printfQuda("running the following test:\n"); + printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n"); + printfQuda("%s %s %d/%d/%d/ %d %s \n", + get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order)); + printfQuda("Grid partition info: X Y Z T\n"); + printfQuda(" %d %d %d %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2), + dimPartitioned(3)); + printfQuda("Number of Naiks: %d\n", has_naik ? 
2 : 1); + } + +public: + virtual void SetUp() { + QudaPrecision prec = static_cast(::testing::get<0>(GetParam())); + QudaReconstructType recon = static_cast(::testing::get<1>(GetParam())); + bool has_naik = ::testing::get<2>(GetParam()); + + if (skip()) GTEST_SKIP(); + + int partition = ::testing::get<3>(GetParam()); + for (int j = 0; j < 4; j++) { + if (partition & (1 << j)) { commDimPartitionedSet(j); } + } + updateR(); + + hisq_stencil_test_wrapper.init_ctest(prec, recon, has_naik); + display_test_info(prec, recon, has_naik); + } + + virtual void TearDown() { + if (skip()) GTEST_SKIP(); + hisq_stencil_test_wrapper.end(); + } + + static void SetUpTestCase() { + initQuda(device_ordinal); + } + + // Per-test-case tear-down. + // Called after the last test in this test case. + // Can be omitted if not needed. + static void TearDownTestCase() + { + HisqStencilTestWrapper::destroy(); + endQuda(); + } +}; + +TEST_P(HisqStencilTest, benchmark) +{ + hisq_stencil_test_wrapper.run_test(niter, /**show_metrics =*/true); +} + +TEST_P(HisqStencilTest, verify) +{ + hisq_stencil_test_wrapper.run_test(2); + + std::array res = hisq_stencil_test_wrapper.verify(); + + // extra factor of 10 b/c the norm isn't normalized + double max_dev = 10. 
* getTolerance(prec); + + // fat link + EXPECT_LE(res[0], max_dev) << "Reference CPU and QUDA implementations of fat link do not agree"; + + // long link + EXPECT_LE(res[1], max_dev) << "Reference CPU and QUDA implementations of long link do not agree"; +} + +int main(int argc, char **argv) +{ + // initialize google test + ::testing::InitGoogleTest(&argc, argv); + + // for speed + xdim = ydim = zdim = tdim = 8; + + // default to 18 reconstruct + link_recon = QUDA_RECONSTRUCT_NO; + cpu_prec = prec = QUDA_DOUBLE_PRECISION; + + // Parse command line options + auto app = make_app(); + app->add_option("--all-partitions", ctest_all_partitions, "Test all instead of reduced combination of partitions"); + try { + app->parse(argc, argv); + } catch (const CLI::ParseError &e) { + return app->exit(e); + } + + if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION) + errorQuda("Precision %d is unsupported in some link fattening routines\n", prec); + + if (link_recon != QUDA_RECONSTRUCT_NO) + errorQuda("Reconstruct %d is unsupported in some link fattening routines\n", link_recon); + + if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order); + + if (eps_naik != 0.0) { n_naiks = 2; } + + setVerbosity(verbosity); + initComms(argc, argv, gridsize_from_cmdline); + + // Ensure gtest prints only from rank 0 + ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); + if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } + + int test_rc = RUN_ALL_TESTS(); + + finalizeComms(); + + return test_rc; +} + +std::string gethisqstenciltestname(testing::TestParamInfo<::testing::tuple> param) +{ + const QudaPrecision prec = static_cast(::testing::get<0>(param.param)); + const QudaReconstructType recon = static_cast(::testing::get<1>(param.param)); + const bool has_naik = ::testing::get<2>(param.param); + const int part = ::testing::get<3>(param.param); + std::stringstream ss; + //
ss << get_dslash_str(dslash_type) << "_"; + ss << get_prec_str(prec); + ss << "_r" << recon; + if (has_naik) ss << "_naik"; + ss << "_partition" << part; + return ss.str(); +} + +#ifdef MULTI_GPU +INSTANTIATE_TEST_SUITE_P(QUDA, HisqStencilTest, + Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), + ::testing::Values(QUDA_RECONSTRUCT_NO), + ::testing::Bool(), + Range(0, 16)), + gethisqstenciltestname); +#else +INSTANTIATE_TEST_SUITE_P(QUDA, HisqStencilTest, + Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), + ::testing::Values(QUDA_RECONSTRUCT_NO), + ::testing::Bool(), + ::testing::Values(0)), + gethisqstenciltestname); +#endif diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp index 138bba9eea..e42dc7b804 100644 --- a/tests/hisq_stencil_test.cpp +++ b/tests/hisq_stencil_test.cpp @@ -59,10 +59,10 @@ TEST_F(HisqStencilTest, verify) double max_dev = 10. * getTolerance(prec); // fat link - EXPECT_LE(res[0], max_dev); + EXPECT_LE(res[0], max_dev) << "Reference CPU and QUDA implementations of fat link do not agree"; // long link - EXPECT_LE(res[1], max_dev); + EXPECT_LE(res[1], max_dev) << "Reference CPU and QUDA implementations of long link do not agree"; } int main(int argc, char **argv) @@ -88,6 +88,9 @@ int main(int argc, char **argv) if (prec == QUDA_HALF_PRECISION || prec == QUDA_QUARTER_PRECISION) errorQuda("Precision %d is unsupported in some link fattening routines\n", prec); + if (link_recon != QUDA_RECONSTRUCT_NO) + errorQuda("Reconstruct %d is unsupported in some link fattening routines\n", link_recon); + if (gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", gauge_order); if (eps_naik != 0.0) { n_naiks = 2; } diff --git a/tests/hisq_stencil_test_utils.h b/tests/hisq_stencil_test_utils.h index b4f7512c12..4c1f3cfcc3 100644 --- a/tests/hisq_stencil_test_utils.h +++ b/tests/hisq_stencil_test_utils.h @@ -63,23 +63,54 @@ struct HisqStencilTestWrapper { static inline void 
*fatlink_eps = nullptr; static inline void *longlink_eps = nullptr; + static inline void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr}; static inline void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - void init_test() { - cpu_prec = prec; - host_gauge_data_type_size = cpu_prec; + void set_naik(bool has_naik) { + if (has_naik) { + eps_naik = -0.03; // semi-arbitrary + n_naiks = 2; + } else { + eps_naik = 0.0; + n_naiks = 1; + } + } + + void init_ctest(QudaPrecision prec_, QudaReconstructType link_recon_, bool has_naik) { + prec = prec_; + link_recon = link_recon_; + + set_naik(has_naik); gauge_param = newQudaGaugeParam(); setStaggeredGaugeParam(gauge_param); + gauge_param.cuda_prec = prec; + static bool first_time = true; if (first_time) { + // force the Naik build up front, it doesn't effect the non-naik fields + set_naik(true); init_host(); + set_naik(has_naik); first_time = false; } + init(); + } + + void init_test() { + gauge_param = newQudaGaugeParam(); + setStaggeredGaugeParam(gauge_param); + + static bool first_time = true; + if (first_time) { + init_host(); + first_time = false; + } + init(); } void init_host() { @@ -140,14 +171,10 @@ struct HisqStencilTestWrapper { // Input links // ///////////////// - void *qdp_sitelink[4] = {nullptr, nullptr, nullptr, nullptr}; for (int i = 0; i < 4; i++) qdp_sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size); - milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - // Note: this could be replaced with loading a gauge field createSiteLinkCPU(qdp_sitelink, gauge_param.cpu_prec, 0); // 0 -> no phases - reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cpu_prec, 
gauge_param.cpu_prec); /////////////////////// // Perform CPU Build // @@ -168,23 +195,9 @@ struct HisqStencilTestWrapper { computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, qdp_sitelink, &gauge_param, act_paths, eps_naik); - /////////////////////////////////////////////////////// - // Allocate host storage for fields built on the GPU // - /////////////////////////////////////////////////////// - - // Paths for step 1: - vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links - wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links - - // Paths for step 2: - fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final fat ("X") links - longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links - - // Place to accumulate Naiks - if (n_naiks > 1) { - fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon fat links - longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks - } + ///////////////////////////////////////////////////////////////////// + // Allocate CPU-precision host storage for fields built on the GPU // + ///////////////////////////////////////////////////////////////////// // QDP order fields for (int i = 0; i < 4; i++) { @@ -196,24 +209,60 @@ struct HisqStencilTestWrapper { } } - ///////////////////////////////////////////////////////// - // Free allocations that are only needed for CPU setup // - ///////////////////////////////////////////////////////// - - for (int i = 0; i < 4; i++) - host_free(qdp_sitelink[i]); - #ifdef MULTI_GPU exchange_llfat_cleanup(); #endif } + void init() { + + // reset the reconstruct in gauge param + gauge_param.reconstruct = link_recon; + + ///////////////////////////////////////////////////////////////// + // Create a CPU copy of the initial field in the GPU precision // + 
///////////////////////////////////////////////////////////////// + + milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); + reorderQDPtoMILC(milc_sitelink, qdp_sitelink, V, gauge_site_size, gauge_param.cuda_prec, gauge_param.cpu_prec); + + /////////////////////////////////////////////////////// + // Allocate host storage for fields built on the GPU // + /////////////////////////////////////////////////////// + + // Paths for step 1: + vlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // V links + wlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // W links + + // Paths for step 2: + fatlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // final fat ("X") links + longlink = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // final long links + + // Place to accumulate Naiks + if (n_naiks > 1) { + fatlink_eps = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // epsilon fat links + longlink_eps = pinned_malloc(4 * V * gauge_site_size * gauge_param.cuda_prec); // epsilon long naiks + } + } + static void end() { + if (milc_sitelink) host_free(milc_sitelink); + + // Clean up GPU compute links + if (vlink) host_free(vlink); + if (wlink) host_free(wlink); + if (fatlink) host_free(fatlink); + if (longlink) host_free(longlink); + + if (n_naiks > 1) { + if (fatlink_eps) host_free(fatlink_eps); + if (longlink_eps) host_free(longlink_eps); + } + freeGaugeQuda(); } static void destroy() { - if (milc_sitelink) host_free(milc_sitelink); for (int i = 0; i < 4; i++) { host_free(fat_reflink[i]); @@ -224,18 +273,8 @@ struct HisqStencilTestWrapper { } } - // Clean up GPU compute links - host_free(vlink); - host_free(wlink); - host_free(fatlink); - host_free(longlink); - - if (n_naiks > 1) { - host_free(fatlink_eps); - host_free(longlink_eps); - } - for (int i = 0; i < 4; i++) { + host_free(qdp_sitelink[i]); host_free(qdp_fatlink[i]); 
host_free(qdp_longlink[i]); if (n_naiks > 1) { @@ -259,6 +298,10 @@ struct HisqStencilTestWrapper { comm_barrier(); host_timer.start(); + // manually override precision of input fields + auto cpu_param_backup = gauge_param.cpu_prec; + gauge_param.cpu_prec = gauge_param.cuda_prec; + for (int i = 0; i < niter; i++) { // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! @@ -270,11 +313,11 @@ struct HisqStencilTestWrapper { computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &gauge_param); // Rescale+copy Naiks into Naik field - cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); - cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size); + cpu_axy(gauge_param.cuda_prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); + cpu_axy(gauge_param.cuda_prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size); } else { - memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); - memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size); + memset(fatlink, 0, V * 4 * gauge_site_size * gauge_param.cuda_prec); + memset(longlink, 0, V * 4 * gauge_site_size * gauge_param.cuda_prec); } // Create X and long links, 2nd path table set @@ -282,11 +325,13 @@ struct HisqStencilTestWrapper { if (n_naiks > 1) { // Add into Naik field - cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size); - cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size); + cpu_xpy(gauge_param.cuda_prec, fatlink, fatlink_eps, V * 4 * gauge_site_size); + cpu_xpy(gauge_param.cuda_prec, longlink, longlink_eps, V * 4 * gauge_site_size); } } + gauge_param.cpu_prec = cpu_param_backup; + host_timer.stop(); return host_timer.last(); @@ -341,12 +386,12 @@ struct HisqStencilTestWrapper { // Layout change for fatlink, fatlink_eps, longlink, longlink_eps // //////////////////////////////////////////////////////////////////// - reorderMILCtoQDP(qdp_fatlink, fatlink, V, 
gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_fatlink, fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec); + reorderMILCtoQDP(qdp_longlink, longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec); if (n_naiks > 1) { - reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + reorderMILCtoQDP(qdp_fatlink_eps, fatlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec); + reorderMILCtoQDP(qdp_longlink_eps, longlink_eps, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cuda_prec); } ////////////////////////////// From cb940215fe4d83972b49fc0d47ffce5846ab377a Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 30 Nov 2023 15:03:30 -0800 Subject: [PATCH 13/53] Some cleanup of staggered_invert_test, working towards a ctest --- tests/staggered_invert_test.cpp | 85 ++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 27 deletions(-) diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 67e96507c1..c039482c94 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -18,6 +18,14 @@ #include #include +QudaGaugeParam gauge_param; +QudaInvertParam inv_param; +QudaMultigridParam mg_param; +QudaInvertParam mg_inv_param; +QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL]; +QudaEigParam eig_param; +bool use_split_grid = false; + #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) void display_test_info() @@ -102,33 +110,28 @@ void display_test_info() dimPartitioned(3)); } -void test(int, char**) -{ +GaugeField cpuFatQDP = {}; +GaugeField cpuLongQDP = {}; +GaugeField cpuFatMILC = {}; +GaugeField cpuLongMILC = {}; +void init() +{ // Set QUDA internal parameters - QudaGaugeParam gauge_param = newQudaGaugeParam(); - QudaInvertParam inv_param = newQudaInvertParam(); + gauge_param = newQudaGaugeParam(); setStaggeredGaugeParam(gauge_param); - if (!inv_multigrid) setStaggeredInvertParam(inv_param); - - QudaInvertParam mg_inv_param = newQudaInvertParam(); - QudaMultigridParam mg_param = newQudaMultigridParam(); - QudaEigParam mg_eig_param[mg_levels]; - // params related to split grid. - for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; - int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; - bool use_split_grid = num_sub_partition > 1; + inv_param = newQudaInvertParam(); + mg_inv_param = newQudaInvertParam(); + mg_param = newQudaMultigridParam(); + eig_param = newQudaEigParam(); if (inv_multigrid) { - // Set some default values for MG solve types setQudaMgSolveTypes(); - setStaggeredMGInvertParam(inv_param); // Set sub structures mg_param.invert_param = &mg_inv_param; - for (int i = 0; i < mg_levels; i++) { if (mg_eig[i]) { mg_eig_param[i] = newQudaEigParam(); @@ -138,10 +141,12 @@ void test(int, char**) mg_param.eig_param[i] = nullptr; } } + // Set MG setStaggeredMultigridParam(mg_param); + } else { + setStaggeredInvertParam(inv_param); } - QudaEigParam eig_param = newQudaEigParam(); if (inv_deflate) { setEigParam(eig_param); inv_param.eig_param = &eig_param; @@ -168,16 +173,16 @@ void test(int, char**) cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuParam.order = QUDA_QDP_GAUGE_ORDER; GaugeField cpuIn = GaugeField(cpuParam); - GaugeField cpuFatQDP = GaugeField(cpuParam); + cpuFatQDP = GaugeField(cpuParam); cpuParam.order = QUDA_MILC_GAUGE_ORDER; - GaugeField 
cpuFatMILC = GaugeField(cpuParam); + cpuFatMILC = GaugeField(cpuParam); cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS; cpuParam.nFace = 3; cpuParam.order = QUDA_QDP_GAUGE_ORDER; - GaugeField cpuLongQDP = GaugeField(cpuParam); + cpuLongQDP = GaugeField(cpuParam); cpuParam.order = QUDA_MILC_GAUGE_ORDER; - GaugeField cpuLongMILC = GaugeField(cpuParam); + cpuLongMILC = GaugeField(cpuParam); void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)}; void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)}; @@ -212,6 +217,14 @@ void test(int, char**) // Staggered Gauge construct END //----------------------------------------------------------------------------------- +} + +std::vector solve() +{ + // params related to split grid. + for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; + int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; + bool use_split_grid = num_sub_partition > 1; // Setup the multigrid preconditioner void *mg_preconditioner = nullptr; @@ -252,6 +265,8 @@ void test(int, char**) // QUDA invert test //---------------------------------------------------------------------------- + std::vector res(Nsrc); + if (multishift == 1) { if (!use_split_grid) { for (int k = 0; k < Nsrc; k++) { @@ -285,7 +300,7 @@ void test(int, char**) for (int k = 0; k < Nsrc; k++) { if (verify_results) - verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, inv_param, 0); + res[k] = verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, inv_param, 0); } } else if (multishift > 1) { if (use_split_grid) @@ -332,20 +347,32 @@ void test(int, char**) printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs, inv_param.gflops / inv_param.secs); + for (int i = 0; i < multishift; i++) { printfQuda("%dth solution: mass=%f, ", i, masses[i]); - verifyStaggeredInversion(tmp, ref, 
in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i); + auto resid = verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i); + if (i == 0) res[k] = resid; } } } else { errorQuda("Invalid number of shifts %d", multishift); - } // switch + } + + // Free the multigrid solver + if (inv_multigrid) destroyMultigridQuda(mg_preconditioner); // Compute timings if (Nsrc > 1 && !use_split_grid) performanceStats(time, gflops, iter); - // Free the multigrid solver - if (inv_multigrid) destroyMultigridQuda(mg_preconditioner); + return res; +} + +void cleanup() +{ + cpuFatQDP = {}; + cpuLongQDP = {}; + cpuFatMILC = {}; + cpuLongMILC = {}; } int main(int argc, char **argv) @@ -392,7 +419,11 @@ int main(int argc, char **argv) initQuda(device_ordinal); - test(argc, argv); + init(); + + solve(); + + cleanup(); // Finalize the QUDA library endQuda(); From b3508beac5ae2be9c58d48fcc3879321faebf6f5 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 30 Nov 2023 20:12:49 -0800 Subject: [PATCH 14/53] Added a mostly working gtest! 
--- tests/host_reference/dslash_reference.cpp | 4 +- tests/host_reference/dslash_reference.h | 2 +- tests/staggered_invert_test.cpp | 67 +++++- tests/staggered_invert_test_gtest.hpp | 242 ++++++++++++++++++++++ 4 files changed, 303 insertions(+), 12 deletions(-) create mode 100644 tests/staggered_invert_test_gtest.hpp diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index a5685ed8fe..c2db9993f8 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -743,7 +743,7 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou return l2r; } -double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, +std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param, int shift) { @@ -798,5 +798,5 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi } } - return l2r; + return {l2r, inv_param.tol_hq}; } diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 8291dc688d..b17238bac4 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -109,7 +109,7 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu void *spinorCheck, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv); -double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, +std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam 
&inv_param, int shift); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index c039482c94..dba2813916 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -26,7 +26,8 @@ QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL]; QudaEigParam eig_param; bool use_split_grid = false; -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +// if --enable-testing true is passed, we run the tests defined in here +#include void display_test_info() { @@ -219,8 +220,30 @@ void init() //----------------------------------------------------------------------------------- } -std::vector solve() +std::vector> solve(test_t param) { + inv_param.inv_type = ::testing::get<0>(param); + inv_param.solution_type = ::testing::get<1>(param); + inv_param.solve_type = ::testing::get<2>(param); + inv_param.cuda_prec_sloppy = ::testing::get<3>(param); + multishift = ::testing::get<4>(param); + inv_param.solution_accumulator_pipeline = ::testing::get<5>(param); + + // schwarz parameters + auto schwarz_param = ::testing::get<6>(param); + inv_param.schwarz_type = ::testing::get<0>(schwarz_param); + inv_param.inv_type_precondition = ::testing::get<1>(schwarz_param); + inv_param.cuda_prec_precondition = ::testing::get<2>(schwarz_param); + + inv_param.residual_type = ::testing::get<7>(param); + + // reset lambda_max if we're doing a testing loop to ensure correct lambda_max + if (enable_testing) inv_param.ca_lambda_max = -1.0; + + logQuda(QUDA_SUMMARIZE, "Solution = %s, Solve = %s, Solver = %s, Sloppy precision = %s\n", + get_solution_str(inv_param.solution_type), get_solve_str(inv_param.solve_type), + get_solver_str(inv_param.inv_type), get_prec_str(inv_param.cuda_prec_sloppy)); + // params related to split grid.
for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; @@ -265,7 +288,7 @@ std::vector solve() // QUDA invert test //---------------------------------------------------------------------------- - std::vector res(Nsrc); + std::vector> res(Nsrc); if (multishift == 1) { if (!use_split_grid) { @@ -351,7 +374,13 @@ std::vector solve() for (int i = 0; i < multishift; i++) { printfQuda("%dth solution: mass=%f, ", i, masses[i]); auto resid = verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i); - if (i == 0) res[k] = resid; + + // take the HQ residual from the lightest mass + if (i == 0) { + res[k] = resid; + } else { + if (resid[0] > res[k][0]) res[k][0] = resid[0]; + } } } } else { @@ -377,6 +406,7 @@ void cleanup() int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); setQudaStaggeredDefaultInvTestParams(); setQudaDefaultMgTestParams(); // Parse command line options @@ -385,7 +415,7 @@ int main(int argc, char **argv) add_deflation_option_group(app); add_multigrid_option_group(app); add_comms_option_group(app); - + add_testing_option_group(app); try { app->parse(argc, argv); } catch (const CLI::ParseError &e) { @@ -419,17 +449,36 @@ int main(int argc, char **argv) initQuda(device_ordinal); + // need to force a well-behaved operator + reasonable convergence + if (enable_testing) { + compute_fatlong = true; + mass = 0.32; // yes, it's a magic number + tol = 1e-6; + tol_hq = 1e-6; + //niter = 500; // the staggered spectrum is rough + } + init(); - solve(); + int result = 0; + if (enable_testing) { // tests are defined in staggered_invert_test_gtest.hpp + ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); + if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } + if (dslash_type == QUDA_LAPLACE_DSLASH) +
errorQuda("Staggered ctest doesn't support the Laplace operator (yet)"); + result = RUN_ALL_TESTS(); + } else { + solve(test_t {inv_type, solution_type, solve_type, prec_sloppy, multishift, solution_accumulator_pipeline, + schwarz_t {precon_schwarz_type, inv_multigrid ? QUDA_MG_INVERTER : precon_type, prec_precondition}, + inv_param.residual_type}); + } cleanup(); // Finalize the QUDA library + freeGaugeQuda(); endQuda(); - - // Finalize the communications layer finalizeComms(); - return 0; + return result; } diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp new file mode 100644 index 0000000000..4eee2b37ee --- /dev/null +++ b/tests/staggered_invert_test_gtest.hpp @@ -0,0 +1,242 @@ +#include +#include + +// tuple containing parameters for Schwarz solver +using schwarz_t = ::testing::tuple; + +using test_t + = ::testing::tuple; + +class StaggeredInvertTest : public ::testing::TestWithParam +{ +protected: + test_t param; + +public: + StaggeredInvertTest() : param(GetParam()) { } +}; + +bool is_normal_residual(QudaInverterType type) +{ + switch (type) { + case QUDA_CGNR_INVERTER: + case QUDA_CA_CGNR_INVERTER: return true; + default: return false; + } +} + +bool is_preconditioned_solve(QudaSolveType type) +{ + switch (type) { + case QUDA_DIRECT_PC_SOLVE: + case QUDA_NORMOP_PC_SOLVE: return true; + default: return false; + } +} + +bool is_full_solution(QudaSolutionType type) +{ + switch (type) { + case QUDA_MAT_SOLUTION: + case QUDA_MATDAG_MAT_SOLUTION: return true; + default: return false; + } +} + +bool is_normal_solve(test_t param) +{ + auto inv_type = ::testing::get<0>(param); + auto solve_type = ::testing::get<2>(param); + + switch (solve_type) { + case QUDA_NORMOP_SOLVE: + case QUDA_NORMOP_PC_SOLVE: return true; + default: + switch (inv_type) { + case QUDA_CGNR_INVERTER: + case QUDA_CGNE_INVERTER: + case QUDA_CA_CGNR_INVERTER: + case QUDA_CA_CGNE_INVERTER: return true; + default: return false; + } + } +} + +bool 
support_solution_accumulator_pipeline(QudaInverterType type) +{ + switch (type) { + case QUDA_CG_INVERTER: + case QUDA_CA_CG_INVERTER: + case QUDA_CGNR_INVERTER: + case QUDA_CGNE_INVERTER: + case QUDA_PCG_INVERTER: return true; + default: return false; + } +} + +bool skip_test(test_t param) +{ + auto inverter_type = ::testing::get<0>(param); + auto solution_type = ::testing::get<1>(param); + auto prec_sloppy = ::testing::get<3>(param); + auto multishift = ::testing::get<4>(param); + auto solution_accumulator_pipeline = ::testing::get<5>(param); + auto schwarz_param = ::testing::get<6>(param); + auto prec_precondition = ::testing::get<2>(schwarz_param); + + if (prec < prec_sloppy) return true; // outer precision >= sloppy precision + if (!(QUDA_PRECISION & prec_sloppy)) return true; // precision not enabled so skip it + if (!(QUDA_PRECISION & prec_precondition) && prec_precondition != QUDA_INVALID_PRECISION) + return true; // precision not enabled so skip it + if (prec_sloppy < prec_precondition) return true; // sloppy precision >= preconditioner precision + + // Skip if the inverter does not support batched update and batched update is greater than one + if (!support_solution_accumulator_pipeline(inverter_type) && solution_accumulator_pipeline > 1) return true; + // There's no MLocal or MdagMLocal support yet, this is left in for reference + //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) + // if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true; + + // split-grid doesn't support multishift at present + if (use_split_grid && multishift > 1) return true; + + return false; +} + +std::vector> solve(test_t param); + +TEST_P(StaggeredInvertTest, verify) +{ + if (skip_test(GetParam())) GTEST_SKIP(); + + inv_param.tol = 0.0; + inv_param.tol_hq = 0.0; + auto res_t = ::testing::get<7>(GetParam()); + if (res_t & QUDA_L2_RELATIVE_RESIDUAL) inv_param.tol = tol; + if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq; + +
inv_param.reliable_delta = reliable_delta; + + auto tol = inv_param.tol; + // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this + if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50; + // Slight loss of precision possible when reconstructing full solution + if (is_full_solution(::testing::get<1>(GetParam())) && is_preconditioned_solve(::testing::get<2>(GetParam()))) + tol *= 10; + + // Slight loss of precision seems to be possible in single precision + // with the asqtad operator, though it looks like it's because of the + // fat/long links going through a few precision conversions here and there + if (dslash_type == QUDA_ASQTAD_DSLASH && prec <= QUDA_SINGLE_PRECISION) + tol *= 1.1; + + for (auto rsd : solve(GetParam())) { + if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); } + if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); } + } +} + +std::string gettestname(::testing::TestParamInfo param) +{ + std::string name; + name += get_solver_str(::testing::get<0>(param.param)) + std::string("_"); + name += get_solution_str(::testing::get<1>(param.param)) + std::string("_"); + name += get_solve_str(::testing::get<2>(param.param)) + std::string("_"); + name += get_prec_str(::testing::get<3>(param.param)); + if (::testing::get<4>(param.param) > 1) + name += std::string("_shift") + std::to_string(::testing::get<4>(param.param)); + if (::testing::get<5>(param.param) > 1) + name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<5>(param.param)); + auto &schwarz_param = ::testing::get<6>(param.param); + if (::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) { + name += std::string("_") + get_schwarz_str(::testing::get<0>(schwarz_param)); + name += std::string("_") + get_solver_str(::testing::get<1>(schwarz_param)); + name += std::string("_") + get_prec_str(::testing::get<2>(schwarz_param)); + } + auto res_t = ::testing::get<7>(param.param); + if 
(res_t & QUDA_L2_RELATIVE_RESIDUAL) name += std::string("_l2"); + if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) name += std::string("_heavy_quark"); + return name; +} + +using ::testing::Combine; +using ::testing::Values; + +auto staggered_pc_solvers + = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER, QUDA_GCR_INVERTER, + QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); + +auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER); + +// The spectrum of the staggered operator means MR has a miserable time converging, +// it's not MR's fault. Other solvers have troubles too, which I need to think through. +//auto direct_solvers +// = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, +// QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); + +auto direct_solvers + = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_BICGSTABL_INVERTER); + +auto sloppy_precisions + = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION); + +auto solution_accumulator_pipelines = Values(1, 8); + +auto no_schwarz = Combine(Values(QUDA_INVALID_SCHWARZ), Values(QUDA_INVALID_INVERTER), Values(QUDA_INVALID_PRECISION)); + +auto no_heavy_quark = Values(QUDA_L2_RELATIVE_RESIDUAL); + +// the staggered PC op doesn't support "normal" operators since it's already +// Hermitian positive definite + +// preconditioned solves +INSTANTIATE_TEST_SUITE_P(EvenOdd, StaggeredInvertTest, + Combine(staggered_pc_solvers, Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION), + Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1), + solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + gettestname); + +// full system normal solve +INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredInvertTest, + Combine(normal_solvers, 
Values(QUDA_MATDAG_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE), + sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + gettestname); + +// full system direct solve +INSTANTIATE_TEST_SUITE_P(Full, StaggeredInvertTest, + Combine(direct_solvers, Values(QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_SOLVE), sloppy_precisions, + Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + gettestname); + +// preconditioned multi-shift solves +INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest, + Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), + Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(10), + solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + gettestname); + +// Schwarz-preconditioned normal solves +//INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest, +// Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION), +// Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1), +// solution_accumulator_pipelines, +// Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER), +// Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)), +// no_heavy_quark), +// gettestname); + +// Schwarz-preconditioned direct solves +//INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, StaggeredInvertTest, +// Combine(Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE), +// sloppy_precisions, Values(1), solution_accumulator_pipelines, +// Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_MR_INVERTER, QUDA_CA_GCR_INVERTER), +// Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)), +// no_heavy_quark), +// gettestname); + +// Heavy-Quark preconditioned solves +INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest, + Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), + Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1), + solution_accumulator_pipelines, no_schwarz, + 
Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)), + gettestname); From 5aa628fe0695c169151ea2e6527f4c53715f6648 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 30 Nov 2023 22:02:22 -0800 Subject: [PATCH 15/53] More fully pipecleaned staggered ctest; split grid testing outstanding --- tests/CMakeLists.txt | 45 ++++++++++++++++++- tests/staggered_invert_test.cpp | 28 +++++++++--- tests/staggered_invert_test_gtest.hpp | 64 ++++++++++++++++----------- 3 files changed, 104 insertions(+), 33 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 135bbd90be..3fea1f68b4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -953,7 +953,7 @@ elseif(single_prec) set(TEST_PRECS single) endif() -# Inversions +# Wilson-type Inversions foreach(prec IN LISTS TEST_PRECS) if(${prec} STREQUAL "double") @@ -1128,6 +1128,49 @@ foreach(prec IN LISTS TEST_PRECS) endif() endforeach(prec) +# Staggered-type Inversions +foreach(prec IN LISTS TEST_PRECS) + + # These require looser tolerances to keep iterations to solution in check + if(${prec} STREQUAL "double") + set(tol 1e-6) + elseif(${prec} STREQUAL "single") + set(tol 1e-5) + endif() + + if(QUDA_DIRAC_STAGGERED) + # --compute-fat-long true is necessary to get well-behaved fields + + add_test(NAME invert_test_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --enable-testing true + --gtest_output=xml:invert_test_staggered_${prec}.xml) + +# if(DEFINED ENV{QUDA_ENABLE_TUNING}) +# if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) +# add_test(NAME invert_test_splitgrid_wilson_${prec} +# COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} +# --dslash-type wilson --ngcrkrylov 8 +# --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 +# --nsrc ${QUDA_TEST_NUM_PROCS} +# --enable-testing true +# 
--gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml) +# +# set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) +# endif() +# endif() + + add_test(NAME invert_test_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true + --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --enable-testing true + --gtest_output=xml:invert_test_asqtad_${prec}.xml) + endif() +endforeach(prec) + # Eigensolves foreach(prec IN LISTS TEST_PRECS) diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index dba2813916..3299f0d84f 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -449,13 +449,29 @@ int main(int argc, char **argv) initQuda(device_ordinal); - // need force a well-behaved operator + reasonable convergence if (enable_testing) { - compute_fatlong = true; - mass = 0.32; // yes, it's a magic number - tol = 1e-6; - tol_hq = 1e-6; - //niter = 500; // the staggered spectrum is rough + // We need to force a well-behaved operator + reasonable convergence, otherwise + // the staggered tests will fail. These checks are designed to be consistent + // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked" + if (!compute_fatlong) { + warningQuda("compute_fatlong = %d , expected value %d , overriding", compute_fatlong, true); + compute_fatlong = true; + } + + double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 
1e-5 : 1e-6; + if (tol != expected_tol) { + warningQuda("tol = %e , expected value %e , overriding", tol, expected_tol); + tol = expected_tol; + } + if (tol_hq != expected_tol) { + warningQuda("tol_hq = %e , expected value %e , overriding", tol_hq, expected_tol); + tol_hq = expected_tol; + } + + if (niter != 1000) { + warningQuda("niter = %d , expected value %d , overriding", niter, 1000); + niter = 1000; + } } init(); diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 4eee2b37ee..79c4de768d 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -77,7 +77,6 @@ bool support_solution_accumulator_pipeline(QudaInverterType type) bool skip_test(test_t param) { auto inverter_type = ::testing::get<0>(param); - auto solution_type = ::testing::get<1>(param); auto prec_sloppy = ::testing::get<3>(param); auto multishift = ::testing::get<4>(param); auto solution_accumulator_pipeline = ::testing::get<5>(param); @@ -114,25 +113,40 @@ TEST_P(StaggeredInvertTest, verify) if (res_t & QUDA_L2_RELATIVE_RESIDUAL) inv_param.tol = tol; if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq; - inv_param.reliable_delta = reliable_delta; + auto inverter_type = ::testing::get<0>(param); + auto solution_type = ::testing::get<1>(param); + auto solve_type = ::testing::get<2>(param); - auto tol = inv_param.tol; // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this - if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50; - // Slight loss of precision possible when reconstructing full solution - if (is_full_solution(::testing::get<1>(GetParam())) && is_preconditioned_solve(::testing::get<2>(GetParam()))) - tol *= 10; - - // Slight loss of precision seems to be possible in single precision - // with the asqtad operator, though it looks like it's because of the - // fat/long links going through a few precision conversions here and there - if
(dslash_type == QUDA_ASQTAD_DSLASH && prec <= QUDA_SINGLE_PRECISION) + // The mass squared is a proxy for the condition number + if (is_normal_residual(inverter_type)) tol /= (0.25 * mass * mass); + + // To solve the direct operator to a given tolerance, grind the preconditioned + // operator to 0.5 * mass * tol... to keep the target tolerance in inv_param + // in check, we shift the requirement to the verified tolerance instead. + if (is_full_solution(solution_type) && is_preconditioned_solve(solve_type)) { + if (solve_type == QUDA_DIRECT_PC_SOLVE) + tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps + else if (solve_type == QUDA_NORMOP_PC_SOLVE) + tol /= (0.25 * mass * mass); // same as above, but squared as a proxy for the condition number + } + + // The power iterations method of determining the Chebyshev window + // breaks down due to the nature of the spectrum of the direct operator + auto ca_basis_tmp = inv_param.ca_basis; + if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER) + inv_param.ca_basis = QUDA_POWER_BASIS; + + // Slight loss of precision seems to be possible with the asqtad operator + if (dslash_type == QUDA_ASQTAD_DSLASH) tol *= 1.1; for (auto rsd : solve(GetParam())) { if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); } if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); } } + + inv_param.ca_basis = ca_basis_tmp; } std::string gettestname(::testing::TestParamInfo param) @@ -167,14 +181,9 @@ auto staggered_pc_solvers auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER); -// The spectrum of the staggered operator means MR has a miserable time converging, -// it's not MR's fault. Other solvers have troubles too, which I need to think through. 
-//auto direct_solvers -// = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, -// QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); - auto direct_solvers - = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_BICGSTABL_INVERTER); + = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, + QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); auto sloppy_precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION); @@ -214,6 +223,16 @@ INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest, solution_accumulator_pipelines, no_schwarz, no_heavy_quark), gettestname); +// Heavy-Quark preconditioned solves +INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest, + Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), + Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1), + solution_accumulator_pipelines, no_schwarz, + Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)), + gettestname); + +// These are left in but commented out for future reference + // Schwarz-preconditioned normal solves //INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest, // Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION), @@ -233,10 +252,3 @@ INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest, // no_heavy_quark), // gettestname); -// Heavy-Quark preconditioned solves -INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest, - Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), - Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1), - solution_accumulator_pipelines, no_schwarz, - Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)), - 
gettestname); From 550a5a91968f726e2e3b6bc48a8729d9a331d3cc Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 1 Dec 2023 10:48:10 -0800 Subject: [PATCH 16/53] Enabled split grid --- tests/CMakeLists.txt | 26 +++++++++++++------------- tests/invert_test_gtest.hpp | 2 +- tests/staggered_invert_test.cpp | 2 +- tests/staggered_invert_test_gtest.hpp | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3fea1f68b4..b26f49e529 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1148,19 +1148,19 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:invert_test_staggered_${prec}.xml) -# if(DEFINED ENV{QUDA_ENABLE_TUNING}) -# if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) -# add_test(NAME invert_test_splitgrid_wilson_${prec} -# COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} -# --dslash-type wilson --ngcrkrylov 8 -# --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 -# --nsrc ${QUDA_TEST_NUM_PROCS} -# --enable-testing true -# --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml) -# -# set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) -# endif() -# endif() + if(DEFINED ENV{QUDA_ENABLE_TUNING}) + if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) + add_test(NAME invert_test_splitgrid_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) + endif() + endif() add_test(NAME invert_test_asqtad_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} diff --git 
a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp index 74866dbcd7..27c9c873f1 100644 --- a/tests/invert_test_gtest.hpp +++ b/tests/invert_test_gtest.hpp @@ -117,7 +117,7 @@ bool skip_test(test_t param) return true; #endif } - // split-grid doesn't support split-grid at present + // split-grid doesn't support multishift at present if (use_split_grid && multishift > 1) return true; return false; diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 3299f0d84f..4ec84ba49d 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -247,7 +247,7 @@ std::vector> solve(test_t param) // params related to split grid. for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; - bool use_split_grid = num_sub_partition > 1; + use_split_grid = num_sub_partition > 1; // Setup the multigrid preconditioner void *mg_preconditioner = nullptr; diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 79c4de768d..243e0ffdc9 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -95,7 +95,7 @@ bool skip_test(test_t param) //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) // if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true; - // split-grid doesn't support split-grid at present + // split-grid doesn't support multigrid at present if (use_split_grid && multishift > 1) return true; return false; From b2560b571779c45619ae8945e7644ba3f8af1b78 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 1 Dec 2023 11:11:28 -0800 Subject: [PATCH 17/53] Added info on how to run the old tests --- tests/staggered_invert_test.cpp | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 
4ec84ba49d..fbd451adee 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -26,6 +26,9 @@ QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL]; QudaEigParam eig_param; bool use_split_grid = false; +// print instructions on how to run the old tests +bool print_legacy_info = false; + // if --enable-testing true is passed, we run the tests defined in here #include @@ -111,6 +114,18 @@ void display_test_info() dimPartitioned(3)); } +void display_legacy_info() +{ + printfQuda("Instructions for running legacy tests:\n"); + printfQuda("--test 0 -> --solve-type direct --solution-type mat --inv-type bicgstab\n"); + printfQuda("--test 1 -> --solve-type direct-pc --solution-type mat --inv-type cg --matpc even-even\n"); + printfQuda("--test 2 -> --solve-type direct-pc --solution-type mat --inv-type cg --matpc odd-odd\n"); + printfQuda("--test 3 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even\n"); + printfQuda("--test 4 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd\n"); + printfQuda("--test 5 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even --multishift 8\n"); + printfQuda("--test 6 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd --multishift 8\n"); +} + GaugeField cpuFatQDP = {}; GaugeField cpuLongQDP = {}; GaugeField cpuFatMILC = {}; @@ -416,6 +431,7 @@ int main(int argc, char **argv) add_multigrid_option_group(app); add_comms_option_group(app); add_testing_option_group(app); + app->add_option("--legacy-test-info", print_legacy_info, "Print info on how to reproduce the old '--test #' behavior with flags, then exit"); try { app->parse(argc, argv); } catch (const CLI::ParseError &e) { @@ -423,15 +439,20 @@ int main(int argc, char **argv) } setVerbosity(verbosity); - if (inv_deflate && inv_multigrid) - errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n"); - // Set values for 
precisions via the command line. setQudaPrecisions(); // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp) initComms(argc, argv, gridsize_from_cmdline); + if (print_legacy_info) { + display_legacy_info(); + errorQuda("Exiting..."); + } + + if (inv_deflate && inv_multigrid) + errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n"); + initRand(); // Only these fermions are supported in this file From e952379bcc1911a8910aae85a1f215762b41d1a7 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 1 Dec 2023 13:22:05 -0800 Subject: [PATCH 18/53] Added Laplace ctests, tweaked some tolerances, uncovered a BiCGStab issue --- tests/CMakeLists.txt | 22 +++++++++++++ tests/staggered_dslash_ctest.cpp | 11 +++++-- tests/staggered_dslash_test.cpp | 11 +++++-- tests/staggered_eigensolve_test.cpp | 11 +++++-- tests/staggered_invert_test.cpp | 13 +++++--- tests/staggered_invert_test_gtest.hpp | 45 +++++++++++++++++++++++---- tests/utils/host_utils.h | 7 +++++ 7 files changed, 104 insertions(+), 16 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b26f49e529..3955de2cb1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -928,6 +928,19 @@ endif() if(polenv) set_tests_properties(dslash_${DIRAC_NAME}_build_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2}) endif() + + if(QUDA_LAPLACE) + set(DIRAC_NAME laplace) + add_test(NAME dslash_${DIRAC_NAME}_mat_policy${pol2} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type ${DIRAC_NAME} + --test Mat + --dim 2 4 6 8 + --gtest_output=xml:dslash_${DIRAC_NAME}_mat_test_pol${pol2}.xml) + if(polenv) + set_tests_properties(dslash_${DIRAC_NAME}_mat_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2}) + endif() + endif() endif() if(QUDA_COVDEV) @@ -1168,6 +1181,15 @@ foreach(prec IN LISTS TEST_PRECS) --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --enable-testing true 
--gtest_output=xml:invert_test_asqtad_${prec}.xml) + + if (QUDA_LAPLACE) + add_test(NAME invert_test_laplace_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type laplace --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --enable-testing true + --gtest_output=xml:invert_test_laplace_${prec}.xml) + endif() endif() endforeach(prec) diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index f17dfa278c..28a6a48141 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -122,8 +122,15 @@ int main(int argc, char **argv) if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } // Only these fermions are supported in this file - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace_enabled) { + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } else { + if (dslash_type == QUDA_LAPLACE_DSLASH) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't // ask to build the fat/long links... it doesn't make sense. 
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index 4ebed72ed8..2883d29b64 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -85,8 +85,15 @@ int main(int argc, char **argv) if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } // Only these fermions are supported in this file - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace_enabled) { + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } else { + if (dslash_type == QUDA_LAPLACE_DSLASH) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, // and don't ask to build the fat/long links... it doesn't make sense. 
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index f0f2c3eec8..797dcb1311 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -79,8 +79,15 @@ int main(int argc, char **argv) setQudaPrecisions(); // Only these fermions are supported in this file - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace_enabled) { + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } else { + if (dslash_type == QUDA_LAPLACE_DSLASH) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } display_test_info(); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index fbd451adee..be0ef6fd6b 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -456,8 +456,15 @@ int main(int argc, char **argv) initRand(); // Only these fermions are supported in this file - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace_enabled) { + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } else { + if (dslash_type == QUDA_LAPLACE_DSLASH) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != 
QUDA_ASQTAD_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } // Need to add support for LAPLACE MG? if (inv_multigrid) { @@ -501,8 +508,6 @@ int main(int argc, char **argv) if (enable_testing) { // tests are defined in staggered_invert_test_gtest.hpp ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } - if (dslash_type == QUDA_LAPLACE_DSLASH) - errorQuda("Staggered ctest doesn't support the Laplace operator (yet)"); result = RUN_ALL_TESTS(); } else { solve(test_t {inv_type, solution_type, solve_type, prec_sloppy, multishift, solution_accumulator_pipeline, diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 243e0ffdc9..c5884c210a 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -16,6 +16,15 @@ class StaggeredInvertTest : public ::testing::TestWithParam StaggeredInvertTest() : param(GetParam()) { } }; +bool is_hermitian_solver(QudaInverterType type) +{ + switch(type) { + case QUDA_CG_INVERTER: + case QUDA_CA_CG_INVERTER: return true; + default: return false; + } +} + bool is_normal_residual(QudaInverterType type) { switch (type) { @@ -77,6 +86,8 @@ bool support_solution_accumulator_pipeline(QudaInverterType type) bool skip_test(test_t param) { auto inverter_type = ::testing::get<0>(param); + auto solution_type = ::testing::get<1>(param); + auto solve_type = ::testing::get<2>(param); auto prec_sloppy = ::testing::get<3>(param); auto multishift = ::testing::get<4>(param); auto solution_accumulator_pipeline = ::testing::get<5>(param); @@ -95,6 +106,19 @@ bool skip_test(test_t param) //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) // if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true; + if (dslash_type == QUDA_LAPLACE_DSLASH) { + if (multishift > 1) return true; // 
Laplace doesn't support multishift + if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE) return true; // Laplace only supports direct solves + } + + if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) { + // the staggered and asqtad operators aren't HPD + if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type)) return true; + + // MR struggles with the staggered and asqtad spectrum, it's not MR's fault + if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + } + // split-grid doesn't support multigrid at present if (use_split_grid && multishift > 1) return true; @@ -107,6 +131,8 @@ TEST_P(StaggeredInvertTest, verify) { if (skip_test(GetParam())) GTEST_SKIP(); + auto tol_backup = tol; + inv_param.tol = 0.0; inv_param.tol_hq = 0.0; auto res_t = ::testing::get<7>(GetParam()); @@ -124,11 +150,13 @@ TEST_P(StaggeredInvertTest, verify) // To solve the direct operator to a given tolerance, grind the preconditioned // operator to 0.5 * mass * tol... to keep the target tolerance in inv_param // in check, we shift the requirement to the verified tolerance instead. 
- if (is_full_solution(solution_type) && is_preconditioned_solve(solve_type)) { + if (solution_type == QUDA_MAT_SOLUTION) { if (solve_type == QUDA_DIRECT_PC_SOLVE) tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps - else if (solve_type == QUDA_NORMOP_PC_SOLVE) - tol /= (0.25 * mass * mass); // same as above, but squared as a proxy for the condition number + if (solve_type == QUDA_NORMOP_SOLVE) + tol /= (0.5 * mass); // a proxy for the condition number + } else if (solution_type == QUDA_MATDAG_MAT_SOLUTION) { + tol *= 1.05; // seems to need a bit of a bump } // The power iterations method of determining the Chebyshev window @@ -137,16 +165,21 @@ TEST_P(StaggeredInvertTest, verify) if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER) inv_param.ca_basis = QUDA_POWER_BASIS; - // Slight loss of precision seems to be possible with the asqtad operator - if (dslash_type == QUDA_ASQTAD_DSLASH) + // FIXME: there's an issue in mixed precision BiCGStab I need to squash. 
+ if (inverter_type == QUDA_BICGSTAB_INVERTER) tol *= 1.1; + // CGNE needs a bit of a bump + if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER) + tol *= 1.05; + for (auto rsd : solve(GetParam())) { if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); } if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); } } inv_param.ca_basis = ca_basis_tmp; + tol = tol_backup; } std::string gettestname(::testing::TestParamInfo param) @@ -182,7 +215,7 @@ auto staggered_pc_solvers auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER); auto direct_solvers - = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, + = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); auto sloppy_precisions diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 6dfdcfd856..f5276e26f1 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -40,6 +40,13 @@ extern QudaPrecision &cuda_prec_eigensolver; extern QudaPrecision &cuda_prec_refinement_sloppy; extern QudaPrecision &cuda_prec_ritz; +// Determine if the Laplace operator has been defined +#ifdef QUDA_LAPLACE +constexpr bool is_laplace_enabled = true; +#else +constexpr bool is_laplace_enabled = false; +#endif + // Set some basic parameters via command line or use defaults // Implemented in set_params.cpp void setQudaStaggeredDefaultInvTestParams(); From afb41b609b3746d9f1dd1df11f65f2b27290e50a Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 1 Dec 2023 13:27:15 -0800 Subject: [PATCH 19/53] Quality of life BiCGStab readability changes --- lib/inv_bicgstab_quda.cpp | 117 +++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 64 deletions(-) diff --git 
a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index 4fdf08020a..ecb0352095 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -76,7 +76,7 @@ namespace quda { ColorSpinorField *x_sloppy, *r_sloppy, *r_0; double b2 = blas::norm2(b); // norm sq of source - double r2; // norm sq of residual + double r2; // norm sq of residual if (param.deflate) { // Construct the eigensolver and deflation space if requested. @@ -134,7 +134,7 @@ namespace quda { x = b; param.true_res = 0.0; param.true_res_hq = 0.0; - profile.TPSTOP(QUDA_PROFILE_PREAMBLE); + profile.TPSTOP(QUDA_PROFILE_PREAMBLE); return; } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { b2 = r2; @@ -147,12 +147,9 @@ namespace quda { if (param.precision_sloppy == x.Precision()) { r_sloppy = &r; - if(param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) - { + if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO){ r_0 = &b; - } - else - { + } else { ColorSpinorParam csParam(r); csParam.create = QUDA_ZERO_FIELD_CREATE; r_0 = new ColorSpinorField(csParam); // remember to delete this pointer. 
@@ -168,13 +165,10 @@ namespace quda { *r_0 = r; } - if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator) - { + if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator) { x_sloppy = &x; blas::zero(*x_sloppy); - } - else - { + } else { ColorSpinorParam csParam(x); csParam.create = QUDA_ZERO_FIELD_CREATE; csParam.setPrecision(param.precision_sloppy); @@ -222,20 +216,19 @@ namespace quda { if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n", - blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p), - blas::norm2(tmp), blas::norm2(r0), blas::norm2(t)); + blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p), + blas::norm2(tmp), blas::norm2(r0), blas::norm2(t)); - while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && - k < param.maxiter) { + while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) { matSloppy(v, p); Complex r0v; if (param.pipeline) { - r0v = blas::cDotProduct(r0, v); - if (k>0) rho = blas::cDotProduct(r0, r); + r0v = blas::cDotProduct(r0, v); + if (k>0) rho = blas::cDotProduct(r0, r); } else { - r0v = blas::cDotProduct(r0, v); + r0v = blas::cDotProduct(r0, v); } if (abs(rho) == 0.0) alpha = 0.0; else alpha = rho / r0v; @@ -247,38 +240,37 @@ namespace quda { int updateR = 0; if (param.pipeline) { - // omega = (t, r) / (t, t) - omega_t2 = blas::cDotProductNormA(t, rSloppy); - Complex tr = Complex(omega_t2.x, omega_t2.y); - double t2 = omega_t2.z; - omega = tr / t2; - double s2 = blas::norm2(rSloppy); - Complex r0t = blas::cDotProduct(r0, t); - beta = -r0t / r0v; - r2 = s2 - real(omega * conj(tr)) ; - - // now we can work out if we need to do a reliable update + // omega = (t, r) / (t, t) + omega_t2 = blas::cDotProductNormA(t, rSloppy); + Complex tr = Complex(omega_t2.x, omega_t2.y); + double t2 = omega_t2.z; + omega = tr / t2; + double s2 = 
blas::norm2(rSloppy); + Complex r0t = blas::cDotProduct(r0, t); + beta = -r0t / r0v; + r2 = s2 - real(omega * conj(tr)) ; + // now we can work out if we need to do a reliable update updateR = reliable(rNorm, maxrx, maxrr, r2, delta); } else { - // omega = (t, r) / (t, t) - omega_t2 = blas::cDotProductNormA(t, rSloppy); - omega = Complex(omega_t2.x / omega_t2.z, omega_t2.y / omega_t2.z); + // omega = (t, r) / (t, t) + omega_t2 = blas::cDotProductNormA(t, rSloppy); + omega = Complex(omega_t2.x / omega_t2.z, omega_t2.y / omega_t2.z); } if (param.pipeline && !updateR) { - //x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p - blas::caxpbypzYmbw(alpha, p, omega, rSloppy, xSloppy, t); - blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p); - //tripleBiCGstabUpdate(alpha, p, omega, rSloppy, xSloppy, t, -beta*omega, v, beta, p + //x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p + blas::caxpbypzYmbw(alpha, p, omega, rSloppy, xSloppy, t); + blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p); + //tripleBiCGstabUpdate(alpha, p, omega, rSloppy, xSloppy, t, -beta*omega, v, beta, p } else { - //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) - rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, rSloppy, xSloppy, t, r0); - rho0 = rho; - rho = Complex(rho_r2.x, rho_r2.y); - r2 = rho_r2.z; + //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) + rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, rSloppy, xSloppy, t, r0); + rho0 = rho; + rho = Complex(rho_r2.x, rho_r2.y); + r2 = rho_r2.z; } - if (use_heavy_quark_res && k%heavy_quark_check==0) { + if (use_heavy_quark_res && k % heavy_quark_check==0) { if (&x != &xSloppy) { blas::copy(tmp,y); heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy).z); @@ -291,9 +283,9 @@ namespace quda { if (!param.pipeline) updateR = reliable(rNorm, maxrx, maxrr, r2, delta); if (updateR) { - if (x.Precision() != xSloppy.Precision()) blas::copy(x, 
xSloppy); + if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy); - blas::xpy(x, y); // swap these around? + blas::xpy(x, y); // swap these around? mat(r, y); r2 = blas::xmyNorm(b, r); @@ -307,31 +299,30 @@ namespace quda { r2 = blas::xmyNorm(b, r); } - if (x.Precision() != rSloppy.Precision()) blas::copy(rSloppy, r); - blas::zero(xSloppy); + if (x.Precision() != rSloppy.Precision()) blas::copy(rSloppy, r); + blas::zero(xSloppy); - rNorm = sqrt(r2); - maxrr = rNorm; - maxrx = rNorm; - //r0Norm = rNorm; - rUpdate++; + rNorm = sqrt(r2); + maxrr = rNorm; + maxrx = rNorm; + //r0Norm = rNorm; + rUpdate++; } k++; PrintStats("BiCGstab", k, r2, b2, heavy_quark_res); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) - printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n", - blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p), - blas::norm2(tmp), blas::norm2(r0), blas::norm2(t)); + printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n", + blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p), + blas::norm2(tmp), blas::norm2(r0), blas::norm2(t)); // update p - if (!param.pipeline || updateR) {// need to update if not pipeline or did a reliable update - if (abs(rho*alpha) == 0.0) beta = 0.0; - else beta = (rho/rho0) * (alpha/omega); - blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p); + if (!param.pipeline || updateR) { // need to update if not pipeline or did a reliable update + if (abs(rho*alpha) == 0.0) beta = 0.0; + else beta = (rho/rho0) * (alpha/omega); + blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p); } - } if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy); @@ -342,7 +333,7 @@ namespace quda { param.iter += k; - if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); + if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); if (getVerbosity() >= QUDA_VERBOSE) printfQuda("BiCGstab: Reliable updates = %d\n", 
rUpdate); @@ -361,9 +352,7 @@ namespace quda { if (param.precision_sloppy != x.Precision()) { delete r_0; delete r_sloppy; - } - else if(param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) - { + } else if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) { delete r_0; } From 15eeb821ddc4c1d03d559a5c3d5bee9cea2c7a05 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 4 Dec 2023 14:47:08 -0800 Subject: [PATCH 20/53] Strong BiCGStab cleanup, still need to reconcile a host verification headache --- include/invert_quda.h | 11 +- lib/inv_bicgstab_quda.cpp | 181 ++++++++++++++------------ tests/staggered_invert_test_gtest.hpp | 9 +- 3 files changed, 109 insertions(+), 92 deletions(-) diff --git a/include/invert_quda.h b/include/invert_quda.h index 11ac64708e..7cf26a6f4f 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -1048,8 +1048,15 @@ namespace quda { private: const DiracMdagM matMdagM; // used by the eigensolver - // pointers to fields to avoid multiple creation overhead - ColorSpinorField *yp, *rp, *pp, *vp, *tmpp, *tp; + + ColorSpinorField y; // Full precision solution accumulator + ColorSpinorField r; // Full precision residual vector + ColorSpinorField p; // Sloppy precision search direction + ColorSpinorField v; // Sloppy precision A * p + ColorSpinorField t; // Sloppy precision vector used for minres step + ColorSpinorField r0; // Bi-orthogonalization vector + ColorSpinorField r_sloppy; // Slopy precision residual vector + ColorSpinorField x_sloppy; // Sloppy solution accumulator vector bool init = false; public: diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index ecb0352095..10ec609ec3 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -21,15 +21,6 @@ namespace quda { BiCGstab::~BiCGstab() { profile.TPSTART(QUDA_PROFILE_FREE); - - if(init) { - delete yp; - delete rp; - delete pp; - delete vp; - delete tmpp; - delete tp; - } destroyDeflationSpace(); 
profile.TPSTOP(QUDA_PROFILE_FREE); } @@ -55,26 +46,16 @@ namespace quda { if (!init) { ColorSpinorParam csParam(x); csParam.create = QUDA_ZERO_FIELD_CREATE; - yp = new ColorSpinorField(csParam); - rp = new ColorSpinorField(csParam); + y = ColorSpinorField(csParam); + r = ColorSpinorField(csParam); csParam.setPrecision(param.precision_sloppy); - pp = new ColorSpinorField(csParam); - vp = new ColorSpinorField(csParam); - tmpp = new ColorSpinorField(csParam); - tp = new ColorSpinorField(csParam); + p = ColorSpinorField(csParam); + v = ColorSpinorField(csParam); + t = ColorSpinorField(csParam); init = true; } - ColorSpinorField &y = *yp; - ColorSpinorField &r = *rp; - ColorSpinorField &p = *pp; - ColorSpinorField &v = *vp; - ColorSpinorField &tmp = *tmpp; - ColorSpinorField &t = *tp; - - ColorSpinorField *x_sloppy, *r_sloppy, *r_0; - double b2 = blas::norm2(b); // norm sq of source double r2; // norm sq of residual @@ -145,41 +126,36 @@ namespace quda { // set field aliasing according to whether we are doing mixed precision or not if (param.precision_sloppy == x.Precision()) { - r_sloppy = &r; + r_sloppy = r.create_alias(); - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO){ - r_0 = &b; + if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + r0 = b.create_alias(); } else { ColorSpinorParam csParam(r); - csParam.create = QUDA_ZERO_FIELD_CREATE; - r_0 = new ColorSpinorField(csParam); // remember to delete this pointer. 
- *r_0 = r; + csParam.create = QUDA_NULL_FIELD_CREATE; + r0 = ColorSpinorField(csParam); + blas::copy(r0, r); } } else { ColorSpinorParam csParam(x); csParam.setPrecision(param.precision_sloppy); csParam.create = QUDA_NULL_FIELD_CREATE; - r_sloppy = new ColorSpinorField(csParam); - *r_sloppy = r; - r_0 = new ColorSpinorField(csParam); - *r_0 = r; + r_sloppy = ColorSpinorField(csParam); + blas::copy(r_sloppy, r); + r0 = ColorSpinorField(csParam); + blas::copy(r0, r); } if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator) { - x_sloppy = &x; - blas::zero(*x_sloppy); + x_sloppy = x.create_alias(); + blas::zero(x_sloppy); } else { ColorSpinorParam csParam(x); csParam.create = QUDA_ZERO_FIELD_CREATE; csParam.setPrecision(param.precision_sloppy); - x_sloppy = new ColorSpinorField(csParam); + x_sloppy = ColorSpinorField(csParam); } - // Syntatic sugar - ColorSpinorField &rSloppy = *r_sloppy; - ColorSpinorField &xSloppy = *x_sloppy; - ColorSpinorField &r0 = *r_0; - double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver const bool use_heavy_quark_res = @@ -212,21 +188,27 @@ namespace quda { profile.TPSTART(QUDA_PROFILE_COMPUTE); rho = r2; // cDotProductCuda(r0, r_sloppy); // BiCRstab - blas::copy(p, rSloppy); + blas::copy(p, r_sloppy); + + bool converged = convergence(r2, heavy_quark_res, stop, param.tol_hq); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) - printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n", - blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p), - blas::norm2(tmp), blas::norm2(r0), blas::norm2(t)); + printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n", + blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p), + blas::norm2(r0), blas::norm2(t)); - while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) { + // track if we just performed an exact recalculation of y, r, r2 + bool just_updated 
= false; + + while ( !converged && k < param.maxiter) { + just_updated = false; matSloppy(v, p); Complex r0v; if (param.pipeline) { r0v = blas::cDotProduct(r0, v); - if (k>0) rho = blas::cDotProduct(r0, r); + if (k > 0) rho = blas::cDotProduct(r0, r); } else { r0v = blas::cDotProduct(r0, v); } @@ -234,18 +216,18 @@ namespace quda { else alpha = rho / r0v; // r -= alpha*v - blas::caxpy(-alpha, v, rSloppy); + blas::caxpy(-alpha, v, r_sloppy); - matSloppy(t, rSloppy); + matSloppy(t, r_sloppy); int updateR = 0; if (param.pipeline) { // omega = (t, r) / (t, t) - omega_t2 = blas::cDotProductNormA(t, rSloppy); + omega_t2 = blas::cDotProductNormA(t, r_sloppy); Complex tr = Complex(omega_t2.x, omega_t2.y); double t2 = omega_t2.z; omega = tr / t2; - double s2 = blas::norm2(rSloppy); + double s2 = blas::norm2(r_sloppy); Complex r0t = blas::cDotProduct(r0, t); beta = -r0t / r0v; r2 = s2 - real(omega * conj(tr)) ; @@ -253,29 +235,28 @@ namespace quda { updateR = reliable(rNorm, maxrx, maxrr, r2, delta); } else { // omega = (t, r) / (t, t) - omega_t2 = blas::cDotProductNormA(t, rSloppy); + omega_t2 = blas::cDotProductNormA(t, r_sloppy); omega = Complex(omega_t2.x / omega_t2.z, omega_t2.y / omega_t2.z); } if (param.pipeline && !updateR) { //x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p - blas::caxpbypzYmbw(alpha, p, omega, rSloppy, xSloppy, t); - blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p); - //tripleBiCGstabUpdate(alpha, p, omega, rSloppy, xSloppy, t, -beta*omega, v, beta, p + blas::caxpbypzYmbw(alpha, p, omega, r_sloppy, x_sloppy, t); + blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p); + //tripleBiCGstabUpdate(alpha, p, omega, r_sloppy, x_sloppy, t, -beta*omega, v, beta, p } else { //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) - rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, rSloppy, xSloppy, t, r0); + rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, r_sloppy, x_sloppy, t, r0); rho0 = rho; rho = 
Complex(rho_r2.x, rho_r2.y); r2 = rho_r2.z; } if (use_heavy_quark_res && k % heavy_quark_check==0) { - if (&x != &xSloppy) { - blas::copy(tmp,y); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy).z); + if (&x != &x_sloppy) { + heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x_sloppy, r_sloppy).z); } else { - blas::copy(r, rSloppy); + blas::copy(r, r_sloppy); heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z); } } @@ -283,9 +264,9 @@ namespace quda { if (!param.pipeline) updateR = reliable(rNorm, maxrx, maxrr, r2, delta); if (updateR) { - if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy); + if (x.Precision() != x_sloppy.Precision()) blas::copy(x, x_sloppy); - blas::xpy(x, y); // swap these around? + blas::xpy(x, y); mat(r, y); r2 = blas::xmyNorm(b, r); @@ -299,34 +280,74 @@ namespace quda { r2 = blas::xmyNorm(b, r); } - if (x.Precision() != rSloppy.Precision()) blas::copy(rSloppy, r); - blas::zero(xSloppy); + if (x.Precision() != r_sloppy.Precision()) blas::copy(r_sloppy, r); + blas::zero(x_sloppy); rNorm = sqrt(r2); maxrr = rNorm; maxrx = rNorm; //r0Norm = rNorm; rUpdate++; + + just_updated = true; } k++; PrintStats("BiCGstab", k, r2, b2, heavy_quark_res); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) - printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, tmp2=%e r0=%e t2=%e\n", - blas::norm2(x), blas::norm2(rSloppy), blas::norm2(v), blas::norm2(p), - blas::norm2(tmp), blas::norm2(r0), blas::norm2(t)); + printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n", + blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p), + blas::norm2(r0), blas::norm2(t)); + + converged = convergence(r2, heavy_quark_res, stop, param.tol_hq); + + if (converged) { + // make sure we've truly converged + if (!just_updated) { + if (x.Precision() != x_sloppy.Precision()) blas::copy(x, x_sloppy); + blas::xpy(x, y); + mat(r, y); + r2 = blas::xmyNorm(b, r); + + if (param.deflate && sqrt(r2) < 
param.tol_restart) { + // Deflate and accumulate to solution vector + eig_solve->deflate(y, r, evecs, evals, true); + // Compute r_defl = RHS - A * LHS + mat(r, y); + r2 = blas::xmyNorm(b, r); + } + + if (x.Precision() != r_sloppy.Precision()) blas::copy(r_sloppy, r); + blas::zero(x_sloppy); + + rNorm = sqrt(r2); + maxrr = rNorm; + maxrx = rNorm; + //r0Norm = rNorm; + rUpdate++; + + just_updated = true; + } + + // explicitly compute the HQ residual if need be + heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(y, r).z) : 0.0; + + // Update convergence check + converged = convergence(r2, heavy_quark_res, stop, param.tol_hq); + } // update p - if (!param.pipeline || updateR) { // need to update if not pipeline or did a reliable update + if ((!param.pipeline || updateR) && !converged) { // need to update if not pipeline or did a reliable update if (abs(rho*alpha) == 0.0) beta = 0.0; else beta = (rho/rho0) * (alpha/omega); - blas::cxpaypbz(rSloppy, -beta*omega, v, beta, p); + blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p); } } - if (x.Precision() != xSloppy.Precision()) blas::copy(x, xSloppy); - blas::xpy(y, x); + // We have a guarantee that we just converged via the true residual + // y has already been updated + blas::copy(x, y); profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); @@ -338,9 +359,8 @@ namespace quda { if (getVerbosity() >= QUDA_VERBOSE) printfQuda("BiCGstab: Reliable updates = %d\n", rUpdate); if (!param.is_preconditioner) { // do not do the below if we this is an inner solver - // Calculate the true residual - mat(r, x); - param.true_res = sqrt(blas::xmyNorm(b, r) / b2); + // r2 was freshly computed + param.true_res = sqrt(r2 / b2); param.true_res_hq = use_heavy_quark_res ? 
sqrt(blas::HeavyQuarkResidualNorm(x,r).z) : 0.0; PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq); @@ -348,17 +368,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_EPILOGUE); - profile.TPSTART(QUDA_PROFILE_FREE); - if (param.precision_sloppy != x.Precision()) { - delete r_0; - delete r_sloppy; - } else if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) { - delete r_0; - } - - if (&x != &xSloppy) delete x_sloppy; - - profile.TPSTOP(QUDA_PROFILE_FREE); } } // namespace quda diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index c5884c210a..675086e699 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -166,11 +166,12 @@ TEST_P(StaggeredInvertTest, verify) inv_param.ca_basis = QUDA_POWER_BASIS; // FIXME: there's an issue in mixed precision BiCGStab I need to squash. - if (inverter_type == QUDA_BICGSTAB_INVERTER) - tol *= 1.1; + //if (inverter_type == QUDA_BICGSTAB_INVERTER) + // tol *= 1.1; - // CGNE needs a bit of a bump - if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER) + // CGNE and ASQTAD need a bit of a bump + if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER + || dslash_type == QUDA_ASQTAD_DSLASH) tol *= 1.05; for (auto rsd : solve(GetParam())) { From fc65b73eeb26ceef3a6ef4f4043d8894d987b98c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 4 Dec 2023 17:27:55 -0800 Subject: [PATCH 21/53] Various misc cleanup --- tests/host_reference/dslash_reference.cpp | 2 +- tests/staggered_dslash_test_utils.h | 33 ++++++++--------------- tests/staggered_invert_test.cpp | 7 ++--- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index c2db9993f8..19c9616288 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -752,7 +752,7 @@ std::array 
verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda if (inv_param.solution_type == QUDA_MAT_SOLUTION) { stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); - // exact reason for this tbd, this isn't needed in the dslash test... + // correct for the massRescale function inside invertQuda if (dslash_type == QUDA_LAPLACE_DSLASH) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 246dcdfea4..39bdc09c7b 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -66,8 +66,8 @@ struct StaggeredDslashTestWrapper { static inline void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; static inline void *milc_fatlink = nullptr; static inline void *milc_longlink = nullptr; - static inline GaugeField *cpuFat = nullptr; - static inline GaugeField *cpuLong = nullptr; + static inline GaugeField cpuFat; + static inline GaugeField cpuLong; QudaParity parity = QUDA_EVEN_PARITY; @@ -77,26 +77,23 @@ struct StaggeredDslashTestWrapper { static inline bool test_split_grid = false; int num_src = 1; - // Whether or not we need the ghost zones - bool need_ghost_zone = false; - void staggeredDslashRef() { // compare to dslash reference implementation printfQuda("Calculating reference implementation..."); switch (dtest_type) { case dslash_test_type::Dslash: - stag_dslash(spinorRef, *cpuFat, *cpuLong, spinor, parity, dagger, dslash_type); + stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type); break; case dslash_test_type::MatPC: - stag_matpc(spinorRef, *cpuFat, *cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type); + stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type); break; case dslash_test_type::Mat: - stag_mat(spinorRef, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type); + stag_mat(spinorRef, cpuFat, 
cpuLong, spinor, mass, dagger, dslash_type); break; case dslash_test_type::MatDagMat: - stag_mat(tmpCpu, *cpuFat, *cpuLong, spinor, mass, dagger, dslash_type); - stag_mat(spinorRef, *cpuFat, *cpuLong, tmpCpu, mass, 1 - dagger, dslash_type); + stag_mat(tmpCpu, cpuFat, cpuLong, spinor, mass, dagger, dslash_type); + stag_mat(spinorRef, cpuFat, cpuLong, tmpCpu, mass, 1 - dagger, dslash_type); break; default: errorQuda("Test type %d not defined", static_cast(dtest_type)); } @@ -239,13 +236,13 @@ struct StaggeredDslashTestWrapper { GaugeFieldParam cpuFatParam(gauge_param, qdp_fatlink); cpuFatParam.order = QUDA_QDP_GAUGE_ORDER; cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuFat = GaugeField::Create(cpuFatParam); + cpuFat = GaugeField(cpuFatParam); gauge_param.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(gauge_param, qdp_longlink); cpuLongParam.order = QUDA_QDP_GAUGE_ORDER; cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuLong = GaugeField::Create(cpuLongParam); + cpuLong = GaugeField(cpuLongParam); // Override link reconstruct as appropriate for staggered or asqtad if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) { @@ -283,17 +280,9 @@ struct StaggeredDslashTestWrapper { delete dirac; dirac = nullptr; } - freeGaugeQuda(); - - if (cpuFat) { - delete cpuFat; - cpuFat = nullptr; - } - if (cpuLong) { - delete cpuLong; - cpuLong = nullptr; - } + cpuFat = {}; + cpuLong = {}; commDimPartitionedReset(); } diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index be0ef6fd6b..d8775e909f 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -172,7 +172,6 @@ void init() } setDims(gauge_param.X); - // Hack: use the domain wall dimensions so we may use the 5th dim for multi indexing dw_setDims(gauge_param.X, 1); // Staggered Gauge construct START @@ -185,9 +184,9 @@ void init() gauge_param.location = QUDA_CPU_FIELD_LOCATION; GaugeFieldParam cpuParam(gauge_param); 
- cpuParam.create = QUDA_NULL_FIELD_CREATE; - cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuParam.order = QUDA_QDP_GAUGE_ORDER; + cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; + cpuParam.create = QUDA_NULL_FIELD_CREATE; GaugeField cpuIn = GaugeField(cpuParam); cpuFatQDP = GaugeField(cpuParam); cpuParam.order = QUDA_MILC_GAUGE_ORDER; @@ -222,6 +221,8 @@ void init() printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]); } + freeGaugeQuda(); + loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param); // now copy back to QDP aliases, since these are used for the reference dslash From 7c5c2c5600eecd547d89b9800e598eafc4184a96 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 5 Dec 2023 16:01:45 -0800 Subject: [PATCH 22/53] Fixed a verify issue for full parity solves --- tests/host_reference/dslash_reference.cpp | 2 +- tests/staggered_invert_test_gtest.hpp | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 19c9616288..d92a4cf97d 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -771,7 +771,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda } int len = 0; - if (solution_type == QUDA_MAT_SOLUTION || solution_type == QUDA_MATDAG_MAT_SOLUTION) { + if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { len = V; } else { len = Vh; diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 675086e699..18c290b82c 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -155,8 +155,6 @@ TEST_P(StaggeredInvertTest, verify) tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps if (solve_type == QUDA_NORMOP_SOLVE) tol /= (0.5 * 
mass); // a proxy for the condition number - } else if (solution_type == QUDA_MATDAG_MAT_SOLUTION) { - tol *= 1.05; // seems to need a bit of a bump } // The power iterations method of determining the Chebyshev window @@ -165,14 +163,9 @@ TEST_P(StaggeredInvertTest, verify) if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER) inv_param.ca_basis = QUDA_POWER_BASIS; - // FIXME: there's an issue in mixed precision BiCGStab I need to squash. - //if (inverter_type == QUDA_BICGSTAB_INVERTER) - // tol *= 1.1; - - // CGNE and ASQTAD need a bit of a bump - if (inverter_type == QUDA_CGNE_INVERTER || inverter_type == QUDA_CA_CGNE_INVERTER - || dslash_type == QUDA_ASQTAD_DSLASH) - tol *= 1.05; + // Single precision needs a tiny bump + if (prec == QUDA_SINGLE_PRECISION) + tol *= 1.01; for (auto rsd : solve(GetParam())) { if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); } From b66fc76baed0846483ff17d95851a5cb5ff8a5fe Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 6 Dec 2023 12:06:29 -0800 Subject: [PATCH 23/53] Various staggered_invert_test cleanup, made it look more like invert_test. 
--- tests/staggered_invert_test.cpp | 180 ++++++++++++++------------ tests/staggered_invert_test_gtest.hpp | 2 +- 2 files changed, 100 insertions(+), 82 deletions(-) diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index d8775e909f..5230300779 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -279,77 +279,30 @@ std::vector> solve(test_t param) //----------------------------------------------------------------------------------- std::vector in(Nsrc); std::vector out(Nsrc); + std::vector out_multishift(Nsrc * multishift); + quda::ColorSpinorField ref; + quda::ColorSpinorField tmp; quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param); - for (int k = 0; k < Nsrc; k++) { - in[k] = quda::ColorSpinorField(cs_param); - out[k] = quda::ColorSpinorField(cs_param); - } - ColorSpinorField ref(cs_param); - ColorSpinorField tmp(cs_param); + ref = quda::ColorSpinorField(cs_param); + tmp = quda::ColorSpinorField(cs_param); + std::vector> _hp_multi_x(Nsrc, std::vector(multishift)); + // Staggered vector construct END //----------------------------------------------------------------------------------- - // Prepare rng - quda::RNG rng(ref, 1234); - - // Performance measuring - std::vector time(Nsrc); - std::vector gflops(Nsrc); - std::vector iter(Nsrc); - - // Populate `in` with random noise - for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM); } - - // QUDA invert test - //---------------------------------------------------------------------------- - - std::vector> res(Nsrc); + // Setup multishift parameters (if needed) + //--------------------------------------------------------------------------- - if (multishift == 1) { - if (!use_split_grid) { - for (int k = 0; k < Nsrc; k++) { - if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? 
QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - invertQuda(out[k].data(), in[k].data(), &inv_param); - time[k] = inv_param.secs; - gflops[k] = inv_param.gflops / inv_param.secs; - iter[k] = inv_param.iter; - printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs, - inv_param.gflops / inv_param.secs); - } - } else { - std::vector _hp_x(Nsrc); - std::vector _hp_b(Nsrc); - for (int k = 0; k < Nsrc; k++) { - _hp_x[k] = out[k].data(); - _hp_b[k] = in[k].data(); - } - inv_param.num_src = Nsrc; - inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition; - invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(), cpuLongMILC.data(), - &gauge_param); - quda::comm_allreduce_int(inv_param.iter); - inv_param.iter /= comm_size() / num_sub_partition; - quda::comm_allreduce_sum(inv_param.gflops); - inv_param.gflops /= comm_size() / num_sub_partition; - quda::comm_allreduce_max(inv_param.secs); - printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter, - inv_param.secs, inv_param.gflops / inv_param.secs); - } + // Masses + std::vector masses(multishift); - for (int k = 0; k < Nsrc; k++) { - if (verify_results) - res[k] = verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, inv_param, 0); - } - } else if (multishift > 1) { + if (multishift > 1) { if (use_split_grid) errorQuda("Multishift currently doesn't support split grid.\n"); inv_param.num_offset = multishift; - // Prepare vectors for masses - std::vector masses(multishift); - // Consistency check for masses, tols, tols_hq size if we're setting custom values if (multishift_shifts.size() != 0) errorQuda("Multishift shifts are not supported for Wilson-type fermions"); @@ -360,47 +313,112 @@ std::vector> solve(test_t param) if (multishift_tols_hq.size() != 0 && multishift_tols_hq.size() != static_cast(multishift)) errorQuda("Multishift hq tolerance count %d does not agree with number of masses 
passed in %lu\n", multishift, multishift_tols_hq.size()); - // Allocate storage of output arrays - std::vector outArray(multishift); - std::vector qudaOutArray(multishift, cs_param); - - // Copy offsets and tolerances into inv_param; copy data pointers into outArray + // Copy offsets and tolerances into inv_param; allocate and copy data pointers for (int i = 0; i < multishift; i++) { masses[i] = (multishift_masses.size() == 0 ? (mass + i * i * 0.01) : multishift_masses[i]); inv_param.offset[i] = 4 * masses[i] * masses[i]; inv_param.tol_offset[i] = (multishift_tols.size() == 0 ? inv_param.tol : multishift_tols[i]); inv_param.tol_hq_offset[i] = (multishift_tols_hq.size() == 0 ? inv_param.tol_hq : multishift_tols_hq[i]); - outArray[i] = qudaOutArray[i].data(); + // Allocate memory and set pointers + for (int n = 0; n < Nsrc; n++) { + out_multishift[n * multishift + i] = quda::ColorSpinorField(cs_param); + _hp_multi_x[n][i] = out_multishift[n * multishift + i].data(); + } logQuda(QUDA_VERBOSE, "Multishift mass %d = %e ; tolerance %e ; hq tolerance %e\n", i, masses[i], inv_param.tol_offset[i], inv_param.tol_hq_offset[i]); } + } - for (int k = 0; k < Nsrc; k++) { - quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM); - invertMultiShiftQuda((void **)outArray.data(), in[k].data(), &inv_param); + // Setup multishift parameters END + //----------------------------------------------------------------------------------- - time[k] = inv_param.secs; - gflops[k] = inv_param.gflops / inv_param.secs; - iter[k] = inv_param.iter; - printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs, - inv_param.gflops / inv_param.secs); + // Prepare rng, fill host spinors with random numbers + //----------------------------------------------------------------------------------- + + std::vector time(Nsrc); + std::vector gflops(Nsrc); + std::vector iter(Nsrc); + quda::RNG rng(ref, 1234); + + for (int n = 0; n < Nsrc; n++) { + // Populate the host spinor with random 
numbers. + in[n] = quda::ColorSpinorField(cs_param); + quda::spinorNoise(in[n], rng, QUDA_NOISE_UNIFORM); + out[n] = quda::ColorSpinorField(cs_param); + } - for (int i = 0; i < multishift; i++) { - printfQuda("%dth solution: mass=%f, ", i, masses[i]); - auto resid = verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i); + // Prepare rng, fill host spinors with random numbers END + //----------------------------------------------------------------------------------- - // take the HQ residual from the lightest mass - if (i == 0) { - res[k] = resid; + // QUDA invert test + //---------------------------------------------------------------------------- + + std::vector> res(Nsrc); + + if (!use_split_grid) { + + for (int n = 0; n < Nsrc; n++) { + // If deflating, preserve the deflation space between solves + if (inv_deflate) eig_param.preserve_deflation = n < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + // Perform QUDA inversions + if (multishift > 1) { + invertMultiShiftQuda(_hp_multi_x[n].data(), in[n].data(), &inv_param); + } else { + invertQuda(out[n].data(), in[n].data(), &inv_param); + } + + time[n] = inv_param.secs; + gflops[n] = inv_param.gflops / inv_param.secs; + iter[n] = inv_param.iter; + printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs, + inv_param.gflops / inv_param.secs); + + if (verify_results) { + if (multishift > 1) { + for (int i = 0; i < multishift; i++) { + printfQuda("%dth solution: mass=%f, ", i, masses[i]); + auto resid = verifyStaggeredInversion(tmp, ref, in[n], out_multishift[n * multishift + i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i); + + // take the HQ residual from the lightest mass + if (i == 0) { + res[n] = resid; + } else { + if (resid[0] > res[n][0]) res[n][0] = resid[0]; + } + } } else { - if (resid[0] > res[k][0]) res[k][0] = resid[0]; + res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, 
inv_param, 0); } } } } else { - errorQuda("Invalid number of shifts %d", multishift); + inv_param.num_src = Nsrc; + inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition; + // Host arrays for solutions, sources, and check + std::vector _hp_x(Nsrc); + std::vector _hp_b(Nsrc); + for (int n = 0; n < Nsrc; n++) { + _hp_x[n] = out[n].data(); + _hp_b[n] = in[n].data(); + } + // Run split grid + invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(), cpuLongMILC.data(), + &gauge_param); + + quda::comm_allreduce_int(inv_param.iter); + inv_param.iter /= comm_size() / num_sub_partition; + quda::comm_allreduce_sum(inv_param.gflops); + inv_param.gflops /= comm_size() / num_sub_partition; + quda::comm_allreduce_max(inv_param.secs); + printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter, + inv_param.secs, inv_param.gflops / inv_param.secs); + + for (int n = 0; n < Nsrc; n++) { + if (verify_results) + res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, inv_param, 0); + } } // Free the multigrid solver diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 18c290b82c..4617c1e85d 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -233,7 +233,7 @@ INSTANTIATE_TEST_SUITE_P(EvenOdd, StaggeredInvertTest, // full system normal solve INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredInvertTest, - Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE), + Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE), sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), gettestname); From a5b89ebedbf7b254d34bc0889d6cb138e8b5167c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 6 Dec 2023 13:23:00 -0800 Subject: [PATCH 24/53] Updated verifyStaggeredInversion to look 
like the regular verifyInversion routine --- tests/host_reference/dslash_reference.cpp | 115 +++++++++++++++------- tests/host_reference/dslash_reference.h | 8 +- tests/staggered_invert_test.cpp | 40 +++----- 3 files changed, 98 insertions(+), 65 deletions(-) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index d92a4cf97d..b2688b0094 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -744,59 +744,98 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou } std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int shift) + quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param) { + std::vector out_vector(1); + out_vector[0] = out; + return verifyStaggeredInversion(tmp, ref, in, out_vector, fat_link, + long_link, inv_param); +} + +std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, + std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 
1 : 0; + double l2r_max = 0.0; + double hqr_max = 0.0; + if (multishift > 1) { + if (dslash_type == QUDA_LAPLACE_DSLASH) + errorQuda("Multishift solves do not support the laplace operator (yet)"); - if (inv_param.solution_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); + if (inv_param.solution_type != QUDA_MATPC_SOLUTION) + errorQuda("Invalid staggered multishift solution type %d, expected QUDA_MATPC_SOLUTION", inv_param.solution_type); - // correct for the massRescale function inside invertQuda - if (dslash_type == QUDA_LAPLACE_DSLASH) - ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); - } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { + // Check the mat_pc type and make sure it's sane QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type); - } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type); - stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); - } else { - errorQuda("Invalid staggered solution type %d", inv_param.solution_type); - } - int len = 0; - if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - len = V; - } else { - len = Vh; - } + for (int i = 0; i < multishift; i++) { + auto& out = out_vector[i]; + double mass = 0.5 * sqrt(inv_param.offset[i]); + stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type); - mxpy(in.data(), ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec); - double nrm2 = norm_2(ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec); - double src2 = norm_2(in.data(), 
len * stag_spinor_site_size, inv_param.cpu_prec); - double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z); - double l2r = sqrt(nrm2 / src2); + mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(in.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z); + double l2r = sqrt(nrm2 / src2); + + printfQuda("%dth solution: mass=%f, ", i, mass); + printfQuda("Shift %2d residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, " + "QUDA = %9.6e, host = %9.6e\n", + i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, + inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr); + // Empirical: if the cpu residue is more than 1 order the target accuracy, then it fails to converge + if (sqrt(nrm2 / src2) > 10 * inv_param.tol_offset[i]) { + printfQuda("Shift %2d has empirically failed to converge\n", i); + } + + l2r_max = std::max(l2r_max, l2r); + hqr_max = std::max(hqr_max, hqr); + } - if (multishift == 1) { - printfQuda("Residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, QUDA = %9.6e, " - "host = %9.6e\n", - inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); } else { - printfQuda("Shift %2d residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, " - "QUDA = %9.6e, host = %9.6e\n", - shift, inv_param.tol_offset[shift], inv_param.true_res_offset[shift], l2r, - inv_param.tol_hq_offset[shift], inv_param.true_res_hq_offset[shift], hqr); - // Empirical: if the cpu residue is more than 1 order the target accuracy, then it fails to converge - if (sqrt(nrm2 / src2) > 10 * inv_param.tol_offset[shift]) { - printfQuda("Shift %2d has empirically failed to converge\n", shift); + auto& out = 
out_vector[0]; + double mass = inv_param.mass; + if (inv_param.solution_type == QUDA_MAT_SOLUTION) { + stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); + + // correct for the massRescale function inside invertQuda + if (dslash_type == QUDA_LAPLACE_DSLASH) + ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); + } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { + QudaParity parity = QUDA_INVALID_PARITY; + switch (inv_param.matpc_type) { + case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; + case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; + default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; + } + stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type); + } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { + stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); + } else { + errorQuda("Invalid staggered solution type %d", inv_param.solution_type); } + + mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(in.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z); + double l2r = sqrt(nrm2 / src2); + + printfQuda("Residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, QUDA = %9.6e, " + "host = %9.6e\n", + inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); + + l2r_max = l2r; + hqr_max = hqr; } - return {l2r, inv_param.tol_hq}; + return {l2r_max, hqr_max}; } diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index b17238bac4..85fc096ff6 100644 --- a/tests/host_reference/dslash_reference.h +++ 
b/tests/host_reference/dslash_reference.h @@ -110,8 +110,12 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv); std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int shift); + quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param); + +std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, + std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param); // i represents a "half index" into an even or odd "half lattice". // when oddBit={0,1} the half lattice is {even,odd}. diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 5230300779..373d78b4f7 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -355,8 +355,6 @@ std::vector> solve(test_t param) // QUDA invert test //---------------------------------------------------------------------------- - std::vector> res(Nsrc); - if (!use_split_grid) { for (int n = 0; n < Nsrc; n++) { @@ -374,24 +372,6 @@ std::vector> solve(test_t param) iter[n] = inv_param.iter; printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs, inv_param.gflops / inv_param.secs); - - if (verify_results) { - if (multishift > 1) { - for (int i = 0; i < multishift; i++) { - printfQuda("%dth solution: mass=%f, ", i, masses[i]); - auto resid = verifyStaggeredInversion(tmp, ref, in[n], out_multishift[n * multishift + i], masses[i], cpuFatQDP, cpuLongQDP, inv_param, i); - - // take the HQ residual from the lightest mass - if (i == 0) { - res[n] = resid; - } else { - if (resid[0] > res[n][0]) 
res[n][0] = resid[0]; - } - } - } else { - res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, inv_param, 0); - } - } } } else { inv_param.num_src = Nsrc; @@ -414,11 +394,6 @@ std::vector> solve(test_t param) quda::comm_allreduce_max(inv_param.secs); printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter, inv_param.secs, inv_param.gflops / inv_param.secs); - - for (int n = 0; n < Nsrc; n++) { - if (verify_results) - res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], mass, cpuFatQDP, cpuLongQDP, inv_param, 0); - } } // Free the multigrid solver @@ -427,6 +402,21 @@ std::vector> solve(test_t param) // Compute timings if (Nsrc > 1 && !use_split_grid) performanceStats(time, gflops, iter); + std::vector> res(Nsrc); + // Perform host side verification of inversion if requested + if (verify_results) { + for (int n = 0; n < Nsrc; n++) { + if (multishift > 1) { + printfQuda("\nSource %d:\n", n); + // Create an appropriate subset of the full out_multishift vector + std::vector out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; + res[n] = verifyStaggeredInversion(tmp, ref, in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); + } else { + res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param); + } + } + } + return res; } From 6e339612f377211f7105b5b8b4ff3d02fc7abb36 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 6 Dec 2023 14:57:00 -0800 Subject: [PATCH 25/53] Refactored staggered_eigensolve_test to look more like eigensolve_test, working towards a ctest --- tests/staggered_eigensolve_test.cpp | 351 ++++++++++++++++++---------- 1 file changed, 229 insertions(+), 122 deletions(-) diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 797dcb1311..c458cb4aaf 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp 
@@ -15,125 +15,116 @@ #include #include -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +QudaGaugeParam gauge_param; +QudaInvertParam eig_inv_param; +QudaEigParam eig_param; -void display_test_info() +// if "--enable-testing true" is passed, we run the tests defined in here +//#include + +void display_test_info(QudaEigParam &param) { printfQuda("running the following test:\n"); - printfQuda("prec sloppy_prec link_recon sloppy_link_recon S_dimension T_dimension\n"); - printfQuda("%s %s %s %s %d/%d/%d %d \n", get_prec_str(prec), - get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy), - xdim, ydim, zdim, tdim); + + printfQuda("prec sloppy_prec link_recon sloppy_link_recon S_dimension T_dimension Ls_dimension\n"); + printfQuda("%s %s %s %s %d/%d/%d %d %d\n", get_prec_str(prec), + get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy), xdim, ydim, zdim, + tdim, Lsdim); printfQuda("\n Eigensolver parameters\n"); - printfQuda(" - solver mode %s\n", get_eig_type_str(eig_type)); - printfQuda(" - spectrum requested %s\n", get_eig_spectrum_str(eig_spectrum)); - if (eig_type == QUDA_EIG_BLK_TR_LANCZOS) printfQuda(" - eigenvector block size %d\n", eig_block_size); - printfQuda(" - number of eigenvectors requested %d\n", eig_n_conv); - printfQuda(" - size of eigenvector search space %d\n", eig_n_ev); - printfQuda(" - size of Krylov space %d\n", eig_n_kr); - printfQuda(" - solver tolerance %e\n", eig_tol); - printfQuda(" - convergence required (%s)\n", eig_require_convergence ?
"true" : "false"); - if (eig_compute_svd) { + printfQuda(" - solver mode %s\n", get_eig_type_str(param.eig_type)); + printfQuda(" - spectrum requested %s\n", get_eig_spectrum_str(param.spectrum)); + if (param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) printfQuda(" - eigenvector block size %d\n", param.block_size); + printfQuda(" - number of eigenvectors requested %d\n", param.n_conv); + printfQuda(" - size of eigenvector search space %d\n", param.n_ev); + printfQuda(" - size of Krylov space %d\n", param.n_kr); + printfQuda(" - solver tolerance %e\n", param.tol); + printfQuda(" - convergence required (%s)\n", param.require_convergence ? "true" : "false"); + if (param.compute_svd) { printfQuda(" - Operator: MdagM. Will compute SVD of M\n"); printfQuda(" - ***********************************************************\n"); printfQuda(" - **** Overriding any previous choices of operator type. ****\n"); printfQuda(" - **** SVD demands normal operator, will use MdagM ****\n"); printfQuda(" - ***********************************************************\n"); } else { - printfQuda(" - Operator: daggered (%s) , norm-op (%s)\n", eig_use_dagger ? "true" : "false", - eig_use_normop ? "true" : "false"); + printfQuda(" - Operator: daggered (%s) , norm-op (%s), even-odd pc (%s)\n", param.use_dagger ? "true" : "false", + param.use_norm_op ? "true" : "false", param.use_pc ? 
"true" : "false"); } - if (eig_use_poly_acc) { - printfQuda(" - Chebyshev polynomial degree %d\n", eig_poly_deg); - printfQuda(" - Chebyshev polynomial minumum %e\n", eig_amin); - if (eig_amax < 0) + if (param.use_poly_acc) { + printfQuda(" - Chebyshev polynomial degree %d\n", param.poly_deg); + printfQuda(" - Chebyshev polynomial minumum %e\n", param.a_min); + if (param.a_max <= 0) printfQuda(" - Chebyshev polynomial maximum will be computed\n"); else - printfQuda(" - Chebyshev polynomial maximum %e\n\n", eig_amax); + printfQuda(" - Chebyshev polynomial maximum %e\n\n", param.a_max); } - printfQuda("Grid partition info: X Y Z T\n"); printfQuda(" %d %d %d %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2), dimPartitioned(3)); } -int main(int argc, char **argv) -{ - // Set defaults - setQudaStaggeredDefaultInvTestParams(); - - auto app = make_app(); - add_eigen_option_group(app); - - try { - app->parse(argc, argv); - } catch (const CLI::ParseError &e) { - return app->exit(e); - } - - // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp) - initComms(argc, argv, gridsize_from_cmdline); - - // Set values for precisions via the command line. 
- setQudaPrecisions(); - - // Only these fermions are supported in this file - if (is_laplace_enabled) { - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); - } else { - if (dslash_type == QUDA_LAPLACE_DSLASH) - errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); - } - - display_test_info(); +GaugeField cpuFatQDP = {}; +GaugeField cpuLongQDP = {}; +GaugeField cpuFatMILC = {}; +GaugeField cpuLongMILC = {}; +void init() +{ // Set QUDA internal parameters - QudaGaugeParam gauge_param = newQudaGaugeParam(); + gauge_param = newQudaGaugeParam(); setStaggeredGaugeParam(gauge_param); + // Though no inversions are performed, the inv_param // structure contains all the information we need to - // construct the dirac operator. We encapsualte the - // inv_param structure inside the eig_param structure - // to avoid any confusion - QudaInvertParam eig_inv_param = newQudaInvertParam(); + // construct the dirac operator. 
+ eig_inv_param = newQudaInvertParam(); setStaggeredInvertParam(eig_inv_param); - QudaEigParam eig_param = newQudaEigParam(); - setEigParam(eig_param); - // We encapsulate the eigensolver parameters inside the invert parameter structure - eig_param.invert_param = &eig_inv_param; - - if (eig_param.arpack_check && !(prec == QUDA_DOUBLE_PRECISION)) { - errorQuda("ARPACK check only available in double precision"); - } - initQuda(device_ordinal); + eig_param = newQudaEigParam(); + // We encapsualte the inv_param structure inside the eig_param structure + eig_param.invert_param = &eig_inv_param; + setEigParam(eig_param); setDims(gauge_param.X); - dw_setDims(gauge_param.X, 1); // so we can use 5-d indexing from dwf + dw_setDims(gauge_param.X, 1); // Staggered Gauge construct START //----------------------------------------------------------------------------------- - void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *milc_fatlink = nullptr; - void *milc_longlink = nullptr; - - for (int dir = 0; dir < 4; dir++) { - qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_fatlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - } - milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - + // Allocate host staggered gauge fields + gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ? 
+ QUDA_SU3_LINKS : + QUDA_ASQTAD_FAT_LINKS; + gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + gauge_param.location = QUDA_CPU_FIELD_LOCATION; + + GaugeFieldParam cpuParam(gauge_param); + cpuParam.order = QUDA_QDP_GAUGE_ORDER; + cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; + cpuParam.create = QUDA_NULL_FIELD_CREATE; + GaugeField cpuIn = GaugeField(cpuParam); + cpuFatQDP = GaugeField(cpuParam); + cpuParam.order = QUDA_MILC_GAUGE_ORDER; + cpuFatMILC = GaugeField(cpuParam); + + cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS; + cpuParam.nFace = 3; + cpuParam.order = QUDA_QDP_GAUGE_ORDER; + cpuLongQDP = GaugeField(cpuParam); + cpuParam.order = QUDA_MILC_GAUGE_ORDER; + cpuLongMILC = GaugeField(cpuParam); + + void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)}; + void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)}; + void *qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)}; constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, true); + // Reorder gauge fields to MILC order + cpuFatMILC = cpuFatQDP; + cpuLongMILC = cpuLongQDP; + // Compute plaquette. Routine is aware that the gauge fields already have the phases on them. + // This needs to be called before `loadFatLongGaugeQuda` because this routine also loads the + // gauge fields with different parameters. 
double plaq[3]; computeStaggeredPlaquetteQDPOrder(qdp_inlink, plaq, gauge_param, dslash_type); printfQuda("Computed plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]); @@ -144,60 +135,176 @@ int main(int argc, char **argv) printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]); } - // Reorder gauge fields to MILC order - reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + freeGaugeQuda(); + + loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param); - loadFatLongGaugeQuda(milc_fatlink, milc_longlink, gauge_param); + // now copy back to QDP aliases, since these are used for the reference dslash + cpuFatQDP = cpuFatMILC; + cpuLongQDP = cpuLongMILC; + // ensure QDP alias has exchanged ghosts + cpuFatQDP.exchangeGhost(); + cpuLongQDP.exchangeGhost(); // Staggered Gauge construct END //----------------------------------------------------------------------------------- +} +//std::vector eigensolve(test_t test_param) +std::vector eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd, QudaEigSpectrumType spectrum) +{ + // Collect testing parameters from gtest + eig_param.eig_type = eig_type; //::testing::get<0>(test_param); + eig_param.use_norm_op = use_norm_op; //::testing::get<1>(test_param); + eig_param.use_pc = use_pc; //::testing::get<2>(test_param); + eig_param.compute_svd = compute_svd; //::testing::get<3>(test_param); + eig_param.spectrum = spectrum; //::testing::get<4>(test_param); + + if (eig_param.use_pc) + eig_inv_param.solution_type = QUDA_MATPC_SOLUTION; + else + eig_inv_param.solution_type = QUDA_MAT_SOLUTION; + + // For gtest testing, we prohibit the use of polynomial acceleration as + // the fine tuning required can inhibit convergence of an 
otherwise + // perfectly good algorithm. We also have a default value of 4 + // for the block size in Block TRLM, and 4 for the batched rotation. + // The user may change these values via the command line: + // --eig-block-size + // --eig-batched-rotate + if (enable_testing) { + eig_use_poly_acc = false; + eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE; + eig_block_size != 4 ? eig_param.block_size = eig_block_size : eig_param.block_size = 4; + eig_batched_rotate != 0 ? eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 4; + } + + logQuda(QUDA_SUMMARIZE, "Action = %s, Solver = %s, norm-op = %s, even-odd = %s, with SVD = %s, spectrum = %s\n", + get_dslash_str(dslash_type), + get_eig_type_str(eig_param.eig_type), eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? "true" : "false", + eig_param.use_pc == QUDA_BOOLEAN_TRUE ? "true" : "false", + eig_param.compute_svd == QUDA_BOOLEAN_TRUE ? "true" : "false", get_eig_spectrum_str(eig_param.spectrum)); + + display_test_info(eig_param); + + // Vector construct START + //---------------------------------------------------------------------------- // Host side arrays to store the eigenpairs computed by QUDA - void **host_evecs = (void **)safe_malloc(eig_n_conv * sizeof(void *)); - for (int i = 0; i < eig_n_conv; i++) { - host_evecs[i] = (void *)safe_malloc(V * stag_spinor_site_size * eig_inv_param.cpu_prec); + int n_eig = eig_n_conv; + if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; + std::vector evecs(n_eig); + quda::ColorSpinorParam cs_param; + constructStaggeredTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); + // Void pointers to host side arrays, compatible with the QUDA interface. 
+ std::vector host_evecs_ptr(n_eig); + // Allocate host side memory and pointers + for (int i = 0; i < n_eig; i++) { + evecs[i] = quda::ColorSpinorField(cs_param); + host_evecs_ptr[i] = evecs[i].data(); } - double _Complex *host_evals = (double _Complex *)safe_malloc(eig_param.n_ev * sizeof(double _Complex)); - double time = 0.0; + // Complex eigenvalues + std::vector<__complex__ double> evals(eig_n_conv); + // Vector construct END + //---------------------------------------------------------------------------- - // QUDA eigensolver test + // QUDA eigensolver test BEGIN //---------------------------------------------------------------------------- - if ((solve_type == QUDA_DIRECT_SOLVE && solution_type == QUDA_MAT_SOLUTION) || - (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type == QUDA_MATPC_SOLUTION) || - (solve_type == QUDA_NORMOP_SOLVE && solution_type == QUDA_MATDAG_MAT_SOLUTION)) { - // This function returns the host_evecs and host_evals pointers, populated with - // the requested data, at the requested prec. All the information needed to - // perfom the solve is in the eig_param container. - // If eig_param.arpack_check == true and precision is double, the routine will - // use ARPACK rather than the GPU. - - time = -((double)clock()); - eigensolveQuda(host_evecs, host_evals, &eig_param); - time += (double)clock(); - - printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", time / CLOCKS_PER_SEC); - } else { - errorQuda("Unsupported combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type)); + // This function returns the host_evecs and host_evals pointers, populated with the + // requested data, at the requested prec. All the information needed to perfom the + // solve is in the eig_param container. If eig_param.arpack_check == true and + // precision is double, the routine will use ARPACK rather than the GPU. 
+ quda::host_timer_t host_timer; + host_timer.start(); + eigensolveQuda(host_evecs_ptr.data(), evals.data(), &eig_param); + host_timer.stop(); + printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last()); + + // Perform host side verification of eigenvector if requested. + // ... + + std::vector residua(eig_n_conv, 0.0); + return residua; + // QUDA eigensolver test COMPLETE + //---------------------------------------------------------------------------- +} + +void cleanup() +{ + cpuFatQDP = {}; + cpuLongQDP = {}; + cpuFatMILC = {}; + cpuLongMILC = {}; +} + +int main(int argc, char **argv) +{ + // Set defaults + setQudaStaggeredDefaultInvTestParams(); + + auto app = make_app(); + add_eigen_option_group(app); + //add_testing_option_group(app); + try { + app->parse(argc, argv); + } catch (const CLI::ParseError &e) { + return app->exit(e); + } + setVerbosity(verbosity); - } // switch + // Set values for precisions via the command line. + setQudaPrecisions(); + + // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp) + initComms(argc, argv, gridsize_from_cmdline); + + initRand(); + + // Only these fermions are supported in this file + if (is_laplace_enabled) { + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } else { + if (dslash_type == QUDA_LAPLACE_DSLASH) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + } - // Deallocate host memory - for (int i = 0; i < eig_n_conv; i++) host_free(host_evecs[i]); - host_free(host_evecs); - host_free(host_evals); + if (eig_param.arpack_check && !(prec == QUDA_DOUBLE_PRECISION)) { + errorQuda("ARPACK check only available in double precision"); 
+ } - // Clean up gauge fields. - for (int dir = 0; dir < 4; dir++) { - host_free(qdp_inlink[dir]); - host_free(qdp_fatlink[dir]); - host_free(qdp_longlink[dir]); + // Sanity check combinations of solve type and solution type + if ((solve_type == QUDA_DIRECT_SOLVE && solution_type != QUDA_MAT_SOLUTION) || + (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type != QUDA_MATPC_SOLUTION) || + (solve_type == QUDA_NORMOP_SOLVE && solution_type != QUDA_MATDAG_MAT_SOLUTION)) { + errorQuda("Invalid combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type)); } - host_free(milc_fatlink); - host_free(milc_longlink); + initQuda(device_ordinal); + + init(); + + int result = 0; + //if (enable_testing) { // tests are defined in invert_test_gtest.hpp + //::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); + //if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } + //result = RUN_ALL_TESTS(); + //} else { + //eigensolve( + // test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum}); + eigensolve(eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum); + //} + cleanup(); + + // Memory clean-up + freeGaugeQuda(); + + // Finalize the QUDA library endQuda(); finalizeComms(); + + return result; } From 829ce62c6e7fdd5bba8429ef77b371223f42d1ab Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 6 Dec 2023 21:09:30 -0800 Subject: [PATCH 26/53] Abstracted the staggered eigensolver test into a gtest. The tests and their parameters converge (slowly), but there is no verify yet. 
--- tests/staggered_eigensolve_test.cpp | 61 +++++--- tests/staggered_eigensolve_test_gtest.hpp | 176 ++++++++++++++++++++++ tests/staggered_invert_test.cpp | 28 ++-- 3 files changed, 229 insertions(+), 36 deletions(-) create mode 100644 tests/staggered_eigensolve_test_gtest.hpp diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index c458cb4aaf..aab4a99c1f 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -20,7 +20,7 @@ QudaInvertParam eig_inv_param; QudaEigParam eig_param; // if "--enable-testing true" is passed, we run the tests defined in here -//#include +#include void display_test_info(QudaEigParam ¶m) { @@ -150,15 +150,14 @@ void init() //----------------------------------------------------------------------------------- } -//std::vector eigensolve(test_t test_param) -std::vector eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd, QudaEigSpectrumType spectrum) +std::vector eigensolve(test_t test_param) { // Collect testing parameters from gtest - eig_param.eig_type = eig_type; //::testing::get<0>(test_param); - eig_param.use_norm_op = use_norm_op; //::testing::get<1>(test_param); - eig_param.use_pc = use_pc; //::testing::get<2>(test_param); - eig_param.compute_svd = compute_svd; //::testing::get<3>(test_param); - eig_param.spectrum = spectrum; //::testing::get<4>(test_param); + eig_param.eig_type = ::testing::get<0>(test_param); + eig_param.use_norm_op = ::testing::get<1>(test_param); + eig_param.use_pc = ::testing::get<2>(test_param); + eig_param.compute_svd = ::testing::get<3>(test_param); + eig_param.spectrum = ::testing::get<4>(test_param); if (eig_param.use_pc) eig_inv_param.solution_type = QUDA_MATPC_SOLUTION; @@ -175,7 +174,6 @@ std::vector eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, Qu if (enable_testing) { eig_use_poly_acc = false; eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE; - eig_block_size != 4 ? 
eig_param.block_size = eig_block_size : eig_param.block_size = 4; eig_batched_rotate != 0 ? eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 4; } @@ -185,7 +183,8 @@ std::vector eigensolve(QudaEigType eig_type, QudaBoolean use_norm_op, Qu eig_param.use_pc == QUDA_BOOLEAN_TRUE ? "true" : "false", eig_param.compute_svd == QUDA_BOOLEAN_TRUE ? "true" : "false", get_eig_spectrum_str(eig_param.spectrum)); - display_test_info(eig_param); + if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE)) + display_test_info(eig_param); // Vector construct START //---------------------------------------------------------------------------- @@ -239,12 +238,13 @@ void cleanup() int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); // Set defaults setQudaStaggeredDefaultInvTestParams(); auto app = make_app(); add_eigen_option_group(app); - //add_testing_option_group(app); + add_testing_option_group(app); try { app->parse(argc, argv); } catch (const CLI::ParseError &e) { @@ -284,18 +284,39 @@ int main(int argc, char **argv) initQuda(device_ordinal); + if (enable_testing) { + // We need to force a well-behaved operator + reasonable convergence, otherwise + // the staggered tests will fail. These checks are designed to be consistent + // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked" + bool changes = false; + if (!compute_fatlong) { compute_fatlong = true; changes = true; } + + double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 
1e-4 : 1e-5; + if (eig_tol != expected_tol) { eig_tol = expected_tol; changes = true; } + if (niter != 1000) { niter = 1000; changes = true; } + if (eig_n_kr != 256) { eig_n_kr = 256; changes = true; } + if (eig_block_size != 8) { eig_block_size = 8; } + + if (changes) { + printfQuda("For gtest, various defaults are changed:\n"); + printfQuda(" --compute-fat-long true\n"); + printfQuda(" --eig-tol (1e-5 for double, 1e-4 for single)\n"); + printfQuda(" --niter 1000\n"); + printfQuda(" --eig-n-kr 256\n"); + } + } + init(); int result = 0; - //if (enable_testing) { // tests are defined in invert_test_gtest.hpp - //::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); - //if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } - //result = RUN_ALL_TESTS(); - //} else { - //eigensolve( - // test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum}); - eigensolve(eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum); - //} + if (enable_testing) { // tests are defined in invert_test_gtest.hpp + ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); + if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } + result = RUN_ALL_TESTS(); + } else { + eigensolve( + test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum}); + } cleanup(); diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp new file mode 100644 index 0000000000..06cf712e07 --- /dev/null +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -0,0 +1,176 @@ +#include + +using test_t = ::testing::tuple; + +class StaggeredEigensolveTest : public ::testing::TestWithParam +{ +protected: + test_t param; + +public: + StaggeredEigensolveTest() : param(GetParam()) { } +}; + 
+// Get the solve type that this combination corresponds to +QudaSolveType get_solve_type(QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd) { + if (use_norm_op == QUDA_BOOLEAN_FALSE && use_pc == QUDA_BOOLEAN_TRUE && compute_svd == QUDA_BOOLEAN_FALSE) + return QUDA_DIRECT_PC_SOLVE; + else if (use_norm_op == QUDA_BOOLEAN_TRUE && use_pc == QUDA_BOOLEAN_FALSE && compute_svd == QUDA_BOOLEAN_TRUE) + return QUDA_NORMOP_SOLVE; + else if (use_norm_op == QUDA_BOOLEAN_FALSE && use_pc == QUDA_BOOLEAN_FALSE && compute_svd == QUDA_BOOLEAN_FALSE) + return QUDA_DIRECT_SOLVE; + else + return QUDA_INVALID_SOLVE; +} + +bool skip_test(test_t test_param) +{ + auto eig_type = ::testing::get<0>(test_param); + auto use_norm_op = ::testing::get<1>(test_param); + auto use_pc = ::testing::get<2>(test_param); + auto compute_svd = ::testing::get<3>(test_param); + auto spectrum = ::testing::get<4>(test_param); + + // Reverse engineer the operator type + QudaSolveType combo_solve_type = get_solve_type(use_norm_op, use_pc, compute_svd); + if (combo_solve_type == QUDA_DIRECT_PC_SOLVE) { + // matpc + + // this is only legal for the staggered and asqtad op + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + return true; + + // we can only compute the real part for Lanczos, and real or magnitude for Arnoldi + switch (eig_type) { + case QUDA_EIG_TR_LANCZOS: + case QUDA_EIG_BLK_TR_LANCZOS: + if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; + break; + case QUDA_EIG_IR_ARNOLDI: + if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true; + break; + default: break; + } + } else if (combo_solve_type == QUDA_NORMOP_SOLVE) { + // matdag_mat + + // this is only legal for the staggered and asqtad op + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + return true; + + switch (eig_type) { + case QUDA_EIG_TR_LANCZOS: + case QUDA_EIG_BLK_TR_LANCZOS: + if (spectrum != 
QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; + break; + case QUDA_EIG_IR_ARNOLDI: + //if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true; + return true; // we skip this because it takes an unnecessarily long time and it's covered elsewhere + break; + default: return true; break; + } + } else if (combo_solve_type == QUDA_DIRECT_SOLVE) { + // mat + + switch (dslash_type) { + case QUDA_STAGGERED_DSLASH: + // only Arnoldi, imaginary part or magnitude works (real part is degenerate) + // We skip SM because it takes an unnecessarily long time and it's + // covered by HISQ + if (eig_type != QUDA_EIG_IR_ARNOLDI) return true; + if (spectrum != QUDA_SPECTRUM_LI_EIG && spectrum != QUDA_SPECTRUM_SI_EIG && + spectrum != QUDA_SPECTRUM_LM_EIG) return true; + break; + case QUDA_ASQTAD_DSLASH: + // only Arnoldi, imaginary part or magnitude works (real part is degenerate) + if (eig_type != QUDA_EIG_IR_ARNOLDI) return true; + if (spectrum == QUDA_SPECTRUM_LR_EIG || spectrum == QUDA_SPECTRUM_SR_EIG) return true; + break; + case QUDA_LAPLACE_DSLASH: + switch (eig_type) { + case QUDA_EIG_TR_LANCZOS: + case QUDA_EIG_BLK_TR_LANCZOS: + if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; + break; + case QUDA_EIG_IR_ARNOLDI: + if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true; + break; + default: return true; break; + } + break; + default: return true; break; + } + } + + return false; +} + +std::vector eigensolve(test_t test_param); + +TEST_P(StaggeredEigensolveTest, verify) +{ + if (skip_test(GetParam())) GTEST_SKIP(); + double factor = 1.0; + // The IRAM eigensolver will sometimes report convergence with tolerances slightly + // higher than requested. The same phenomenon occurs in ARPACK. This factor + // prevents failure when IRAM has solved to say 2e-6 when 1e-6 is requested. 
+ // The solution to avoid this is to use a Krylov space (eig-n-kr) about 3-4 times the + // size of the search space (eig-n-ev), or use a well chosen Chebyshev polynomial, + // or use a tighter than necessary tolerance. + if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; + auto tol = factor * eig_param.tol; + for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol); +} + +std::string gettestname(::testing::TestParamInfo param) +{ + std::string name; + name += get_eig_type_str(::testing::get<0>(param.param)) + std::string("_"); + name += (::testing::get<1>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("normop") : std::string("direct")) + + std::string("_"); + name += (::testing::get<2>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("evenodd") : std::string("full")) + + std::string("_"); + name += (::testing::get<3>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("withSVD") : std::string("noSVD")) + + std::string("_"); + name += get_eig_spectrum_str(::testing::get<4>(param.param)); + return name; +} + +using ::testing::Combine; +using ::testing::Values; + +// Can solve hermitian systems +auto hermitian_solvers = Values(QUDA_EIG_TR_LANCZOS, QUDA_EIG_BLK_TR_LANCZOS, QUDA_EIG_IR_ARNOLDI); + +// Can solve non-hermitian systems +auto non_hermitian_solvers = Values(QUDA_EIG_IR_ARNOLDI); + +// Eigensolver spectrum types +auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG); +auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG, + QUDA_SPECTRUM_SM_EIG, QUDA_SPECTRUM_LI_EIG, QUDA_SPECTRUM_SI_EIG); + +//using test_t = ::testing::tuple; // Largest real, smallest real, etc + +// Preconditioned direct operators, which are HPD for staggered! 
+INSTANTIATE_TEST_SUITE_P(DirectEvenOdd, StaggeredEigensolveTest, + ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_TRUE), + Values(QUDA_BOOLEAN_FALSE), hermitian_spectrum), + gettestname); + +// full system normal solve +INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredEigensolveTest, + ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum), + gettestname); + + +// full system direct solve +INSTANTIATE_TEST_SUITE_P(DirectFull, StaggeredEigensolveTest, + ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), + gettestname); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 373d78b4f7..8f4236d16c 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -490,24 +490,20 @@ int main(int argc, char **argv) // We need to force a well-behaved operator + reasonable convergence, otherwise // the staggered tests will fail. These checks are designed to be consistent // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked" - if (!compute_fatlong) { - warningQuda("compute_fatlong = %d , expected value %d , overriding", compute_fatlong, true); - compute_fatlong = true; - } + bool changes = false; + if (!compute_fatlong) { compute_fatlong = true; changes = true; } double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 
1e-5 : 1e-6; - if (tol != expected_tol) { - warningQuda("tol = %e , expected value %e , overriding", tol, expected_tol); - tol = expected_tol; - } - if (tol_hq != expected_tol) { - warningQuda("tol_hq = %e , expected value %e , overriding", tol_hq, expected_tol); - tol_hq = 1e-5; - } - - if (niter != 1000) { - warningQuda("niter = %d , expected value %d , overriding", niter, 1000); - compute_fatlong = 1000; + if (tol != expected_tol) { tol = expected_tol; changes = true; } + if (tol_hq != expected_tol) { tol_hq = expected_tol; changes = true; } + if (niter != 1000) { niter = 1000; changes = true; } + + if (changes) { + printfQuda("For gtest, various defaults are changed:\n"); + printfQuda(" --compute-fat-long true\n"); + printfQuda(" --tol (1e-6 for double, 1e-5 for single)\n"); + printfQuda(" --tol-hq (1e-6 for double, 1e-5 for single)\n"); + printfQuda(" --niter 1000\n"); } } From 4837bc64f6726afe62c99146efa5db374088a00c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 6 Dec 2023 22:14:38 -0800 Subject: [PATCH 27/53] Added verify functions for eigenvectors and singular vectors --- tests/host_reference/dslash_reference.cpp | 79 +++++++++++++++++++++++ tests/host_reference/dslash_reference.h | 6 ++ tests/staggered_eigensolve_test.cpp | 12 ++++ 3 files changed, 97 insertions(+) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index b2688b0094..21b42247ec 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -839,3 +839,82 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda return {l2r_max, hqr_max}; } + +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +{ + QudaInvertParam& inv_param = *(eig_param.invert_param); + int dagger = inv_param.dagger == QUDA_DAG_YES ? 
1 : 0; + bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); + bool normop = (eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? true : false); + double mass = inv_param.mass; + + // Reverse engineer a "solution_type" to help determine which host dslash needs to be applied + QudaSolutionType sol_type = QUDA_INVALID_SOLUTION; + if (normop) { + if (use_pc) errorQuda("The normal preconditioned staggered op is not supported"); + else sol_type = QUDA_MATDAG_MAT_SOLUTION; + } else { + if (use_pc) sol_type = QUDA_MATPC_SOLUTION; + else sol_type = QUDA_MAT_SOLUTION; + } + + // Create temporary spinors + quda::ColorSpinorParam csParam(spinor); + quda::ColorSpinorField ref(csParam); + quda::ColorSpinorField tmp(csParam); + + if (sol_type == QUDA_MAT_SOLUTION) { + stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + } else if (sol_type == QUDA_MATPC_SOLUTION) { + QudaParity parity = QUDA_INVALID_PARITY; + switch (inv_param.matpc_type) { + case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; + case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; + default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; + } + stag_matpc(ref, fat_link, long_link, spinor, mass, 0, tmp, parity, dslash_type); + } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { + stag_mat(tmp, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); + } + + // Compute M * x - \lambda * x + caxpy(-lambda, spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double l2r = sqrt(nrm2 / src2); + + printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + + return l2r; +} + +double 
verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +{ + QudaInvertParam& inv_param = *(eig_param.invert_param); + int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; + bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); + double mass = inv_param.mass; + + if (use_pc) + errorQuda("The SVD of the preconditioned staggered op is not supported"); + + // Create temporary spinors + quda::ColorSpinorParam csParam(spinor_left); + quda::ColorSpinorField ref(csParam); + + // Only `mat` is used here + stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type); + + // Compute M * x_left - \sigma * x_right + caxpy(-sigma, spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(spinor_left.data(), spinor_left.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double l2r = sqrt(nrm2 / src2); + + printfQuda("Singular vector pair %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + + return l2r; +} diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 85fc096ff6..6331fbb65a 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -117,6 +117,12 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param); +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); + +double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, 
quda::ColorSpinorField &spinor_right, double _Complex sigma, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); + // i represents a "half index" into an even or odd "half lattice". // when oddBit={0,1} the half lattice is {even,odd}. // diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index aab4a99c1f..d71bb7cfb2 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -223,6 +223,18 @@ std::vector eigensolve(test_t test_param) // ... std::vector residua(eig_n_conv, 0.0); + // Perform host side verification of eigenvector if requested. + if (verify_results) { + for (int i = 0; i < eig_n_conv; i++) { + if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) { + double _Complex sigma = evals[i]; + residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, cpuLongQDP); + } else { + double _Complex lambda = evals[i]; + residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP); + } + } + } return residua; // QUDA eigensolver test COMPLETE //---------------------------------------------------------------------------- From 30eb5eec81d79e2af2abb63c9a0e7663edc72704 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 6 Dec 2023 22:55:05 -0800 Subject: [PATCH 28/53] Added a ctest for staggered eigensolves, fixed the verify function --- tests/CMakeLists.txt | 44 ++++++++++++++++++++++- tests/staggered_eigensolve_test_gtest.hpp | 3 +- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3955de2cb1..923df3d04c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1193,7 +1193,7 @@ foreach(prec IN LISTS TEST_PRECS) endif() endforeach(prec) -# Eigensolves +# Wilson-type eigensolves foreach(prec IN LISTS TEST_PRECS) if(${prec} STREQUAL "double") @@ -1360,6 +1360,48 @@ foreach(prec IN LISTS TEST_PRECS) endif() 
endforeach(prec) +# Staggered-type eigensolves +foreach(prec IN LISTS TEST_PRECS) + + # These require looser tolerances to keep iterations to solution in check + if(${prec} STREQUAL "double") + set(tol 1e-6) + elseif(${prec} STREQUAL "single") + set(tol 1e-5) + endif() + + if(QUDA_DIRAC_STAGGERED) + # --compute-fat-long true is necessary to get well-behaved fields + + add_test(NAME eigensolve_test_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --compute-fat-long true + --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256 + --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 + --enable-testing true + --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) + + add_test(NAME eigensolve_test_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad --compute-fat-long true + --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256 + --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 + --enable-testing true + --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) + + if (QUDA_LAPLACE) + add_test(NAME eigensolve_test_laplace_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type laplace --compute-fat-long true + --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256 + --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 + --enable-testing true + --gtest_output=xml:staggered_eigensolve_test_laplace_${prec}.xml) + endif() + endif() +endforeach(prec) + + if(QUDA_DIRAC_STAGGERED) add_test(NAME hisq_stencil COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp index 06cf712e07..5c5f8e3890 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -117,7 +117,8 @@ TEST_P(StaggeredEigensolveTest, verify) // The solution to avoid this is to use a Krylov space (eig-n-kr) about 3-4 
times the // size of the search space (eig-n-ev), or use a well chosen Chebyshev polynomial, // or use a tighter than necessary tolerance. - if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; + auto eig_type = ::testing::get<0>(GetParam()); + if (eig_type == QUDA_EIG_IR_ARNOLDI || eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; auto tol = factor * eig_param.tol; for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol); } From e161cdc1b5193f86fbfe2180d44c2e7915756544 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 7 Dec 2023 09:05:27 -0800 Subject: [PATCH 29/53] All sorts of cleanup, moved various is_*_[solve/solution/etc] routines into host utils --- tests/eigensolve_test_gtest.hpp | 11 -- tests/host_reference/dslash_reference.cpp | 2 +- tests/invert_test_gtest.hpp | 72 +------------ tests/staggered_dslash_ctest.cpp | 10 +- tests/staggered_dslash_test.cpp | 8 +- tests/staggered_dslash_test_utils.h | 8 +- tests/staggered_eigensolve_test.cpp | 6 +- tests/staggered_eigensolve_test_gtest.hpp | 4 +- tests/staggered_invert_test.cpp | 8 +- tests/staggered_invert_test_gtest.hpp | 89 ++-------------- tests/utils/host_utils.cpp | 121 +++++++++++++++++++++- tests/utils/host_utils.h | 19 +++- tests/utils/staggered_host_utils.cpp | 2 +- 13 files changed, 170 insertions(+), 190 deletions(-) diff --git a/tests/eigensolve_test_gtest.hpp b/tests/eigensolve_test_gtest.hpp index cd07ca401f..a872963413 100644 --- a/tests/eigensolve_test_gtest.hpp +++ b/tests/eigensolve_test_gtest.hpp @@ -11,17 +11,6 @@ class EigensolveTest : public ::testing::TestWithParam EigensolveTest() : param(GetParam()) { } }; -bool is_chiral(QudaDslashType type) -{ - switch (type) { - case QUDA_DOMAIN_WALL_DSLASH: - case QUDA_DOMAIN_WALL_4D_DSLASH: - case QUDA_MOBIUS_DWF_DSLASH: - case QUDA_MOBIUS_DWF_EOFA_DSLASH: return true; - default: return false; - } -} - bool skip_test(test_t param) { // dwf-style solves must use a normal solver diff 
--git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 21b42247ec..0b461076fd 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -806,7 +806,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); // correct for the massRescale function inside invertQuda - if (dslash_type == QUDA_LAPLACE_DSLASH) + if (is_laplace(dslash_type)) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp index 27c9c873f1..c5b71ead80 100644 --- a/tests/invert_test_gtest.hpp +++ b/tests/invert_test_gtest.hpp @@ -16,79 +16,11 @@ class InvertTest : public ::testing::TestWithParam InvertTest() : param(GetParam()) { } }; -bool is_normal_residual(QudaInverterType type) -{ - switch (type) { - case QUDA_CGNR_INVERTER: - case QUDA_CA_CGNR_INVERTER: return true; - default: return false; - } -} - -bool is_preconditioned_solve(QudaSolveType type) -{ - switch (type) { - case QUDA_DIRECT_PC_SOLVE: - case QUDA_NORMOP_PC_SOLVE: return true; - default: return false; - } -} - -bool is_full_solution(QudaSolutionType type) -{ - switch (type) { - case QUDA_MAT_SOLUTION: - case QUDA_MATDAG_MAT_SOLUTION: return true; - default: return false; - } -} - -bool is_normal_solve(test_t param) -{ - auto inv_type = ::testing::get<0>(param); - auto solve_type = ::testing::get<2>(param); - - switch (solve_type) { - case QUDA_NORMOP_SOLVE: - case QUDA_NORMOP_PC_SOLVE: return true; - default: - switch (inv_type) { - case QUDA_CGNR_INVERTER: - case QUDA_CGNE_INVERTER: - case QUDA_CA_CGNR_INVERTER: - case QUDA_CA_CGNE_INVERTER: return true; - default: return false; - } - } -} - -bool is_chiral(QudaDslashType type) -{ - switch (type) { - case QUDA_DOMAIN_WALL_DSLASH: - 
case QUDA_DOMAIN_WALL_4D_DSLASH: - case QUDA_MOBIUS_DWF_DSLASH: - case QUDA_MOBIUS_DWF_EOFA_DSLASH: return true; - default: return false; - } -} - -bool support_solution_accumulator_pipeline(QudaInverterType type) -{ - switch (type) { - case QUDA_CG_INVERTER: - case QUDA_CA_CG_INVERTER: - case QUDA_CGNR_INVERTER: - case QUDA_CGNE_INVERTER: - case QUDA_PCG_INVERTER: return true; - default: return false; - } -} - bool skip_test(test_t param) { auto inverter_type = ::testing::get<0>(param); auto solution_type = ::testing::get<1>(param); + auto solve_type = ::testing::get<2>(param); auto prec_sloppy = ::testing::get<3>(param); auto multishift = ::testing::get<4>(param); auto solution_accumulator_pipeline = ::testing::get<5>(param); @@ -102,7 +34,7 @@ bool skip_test(test_t param) if (prec_sloppy < prec_precondition) return true; // sloppy precision >= preconditioner precision // dwf-style solves must use a normal solver - if (is_chiral(dslash_type) && !is_normal_solve(param)) return true; + if (is_chiral(dslash_type) && !is_normal_solve(inverter_type, solve_type)) return true; // FIXME this needs to be added to dslash_reference.cpp if (is_chiral(dslash_type) && multishift > 1) return true; // FIXME this needs to be added to dslash_reference.cpp diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index 28a6a48141..c035013568 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -23,7 +23,7 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) + if (is_laplace(dslash_type) && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) return true; const std::array partition_enabled {true, true, true, false, true, false, false, false, @@ -123,12 +123,12 @@ int main(int argc, char **argv) // Only these fermions are supported in this file if (is_laplace_enabled) { - if (dslash_type != QUDA_STAGGERED_DSLASH && 
dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (dslash_type == QUDA_LAPLACE_DSLASH) + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } @@ -146,7 +146,7 @@ int main(int argc, char **argv) eps_naik = 0.0; // to avoid potential headaches } - if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat) + if (is_laplace(dslash_type) && dtest_type != dslash_test_type::Mat) errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str()); int test_rc = RUN_ALL_TESTS(); diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index 2883d29b64..0beb48f887 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -86,12 +86,12 @@ int main(int argc, char **argv) // Only these fermions are supported in this file if (is_laplace_enabled) { - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (dslash_type == QUDA_LAPLACE_DSLASH) + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } @@ -109,7 +109,7 @@ int main(int argc, char **argv) eps_naik = 0.0; // to avoid potential headaches } 
- if (dslash_type == QUDA_LAPLACE_DSLASH && dtest_type != dslash_test_type::Mat) + if (is_laplace(dslash_type) && dtest_type != dslash_test_type::Mat) errorQuda("Test type %s is not supported for the Laplace operator", get_string(dtest_type_map, dtest_type).c_str()); int test_rc = RUN_ALL_TESTS(); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 39bdc09c7b..68b3c676b8 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -245,7 +245,7 @@ struct StaggeredDslashTestWrapper { cpuLong = GaugeField(cpuLongParam); // Override link reconstruct as appropriate for staggered or asqtad - if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) { + if (is_staggered(dslash_type)) { if (link_recon == QUDA_RECONSTRUCT_12) link_recon = QUDA_RECONSTRUCT_13; if (link_recon == QUDA_RECONSTRUCT_8) link_recon = QUDA_RECONSTRUCT_9; } @@ -342,12 +342,12 @@ struct StaggeredDslashTestWrapper { host_timer.start(); - if (dslash_type == QUDA_LAPLACE_DSLASH) { + if (is_laplace(dslash_type)) { switch (dtest_type) { case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break; default: errorQuda("Test type %d not defined on Laplace operator", static_cast(dtest_type)); } - } else { + } else if (is_staggered(dslash_type)) { switch (dtest_type) { case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break; case dslash_test_type::MatPC: dirac->M(cudaSpinorOut, cudaSpinor); break; @@ -355,6 +355,8 @@ struct StaggeredDslashTestWrapper { case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinor); break; default: errorQuda("Test type %d not defined on staggered dslash", static_cast(dtest_type)); } + } else { + errorQuda("Invalid dslash type %d", dslash_type); } host_timer.stop(); diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index d71bb7cfb2..70d70a5b77 100644 --- a/tests/staggered_eigensolve_test.cpp +++ 
b/tests/staggered_eigensolve_test.cpp @@ -274,12 +274,12 @@ int main(int argc, char **argv) // Only these fermions are supported in this file if (is_laplace_enabled) { - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (dslash_type == QUDA_LAPLACE_DSLASH) + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp index 5c5f8e3890..376651c447 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -37,7 +37,7 @@ bool skip_test(test_t test_param) // matpc // this is only legal for the staggered and asqtad op - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + if (!is_staggered(dslash_type)) return true; // we can only compute the real part for Lanczos, and real or magnitude for Arnoldi @@ -55,7 +55,7 @@ bool skip_test(test_t test_param) // matdag_mat // this is only legal for the staggered and asqtad op - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + if (!is_staggered(dslash_type)) return true; switch (eig_type) { diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 8f4236d16c..1c941a59a9 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -466,18 +466,18 @@ int main(int argc, char **argv) // Only these fermions are supported in this file if (is_laplace_enabled) { - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != 
QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) + if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (dslash_type == QUDA_LAPLACE_DSLASH) + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } // Need to add support for LAPLACE MG? if (inv_multigrid) { - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) { + if (!is_staggered(dslash_type)) { errorQuda("dslash_type %s not supported for multigrid preconditioner\n", get_dslash_str(dslash_type)); } } diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 4617c1e85d..27369b4a2f 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -16,73 +16,6 @@ class StaggeredInvertTest : public ::testing::TestWithParam StaggeredInvertTest() : param(GetParam()) { } }; -bool is_hermitian_solver(QudaInverterType type) -{ - switch(type) { - case QUDA_CG_INVERTER: - case QUDA_CA_CG_INVERTER: return true; - default: return false; - } -} - -bool is_normal_residual(QudaInverterType type) -{ - switch (type) { - case QUDA_CGNR_INVERTER: - case QUDA_CA_CGNR_INVERTER: return true; - default: return false; - } -} - -bool is_preconditioned_solve(QudaSolveType type) -{ - switch (type) { - case QUDA_DIRECT_PC_SOLVE: - case QUDA_NORMOP_PC_SOLVE: return true; - default: return false; - } -} - -bool is_full_solution(QudaSolutionType type) -{ - switch (type) { - case QUDA_MAT_SOLUTION: - case QUDA_MATDAG_MAT_SOLUTION: return true; - default: return false; - } -} - -bool is_normal_solve(test_t param) -{ - auto inv_type = ::testing::get<0>(param); - auto solve_type = ::testing::get<2>(param); 
- - switch (solve_type) { - case QUDA_NORMOP_SOLVE: - case QUDA_NORMOP_PC_SOLVE: return true; - default: - switch (inv_type) { - case QUDA_CGNR_INVERTER: - case QUDA_CGNE_INVERTER: - case QUDA_CA_CGNR_INVERTER: - case QUDA_CA_CGNE_INVERTER: return true; - default: return false; - } - } -} - -bool support_solution_accumulator_pipeline(QudaInverterType type) -{ - switch (type) { - case QUDA_CG_INVERTER: - case QUDA_CA_CG_INVERTER: - case QUDA_CGNR_INVERTER: - case QUDA_CGNE_INVERTER: - case QUDA_PCG_INVERTER: return true; - default: return false; - } -} - bool skip_test(test_t param) { auto inverter_type = ::testing::get<0>(param); @@ -106,12 +39,12 @@ bool skip_test(test_t param) //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) // if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true; - if (dslash_type == QUDA_LAPLACE_DSLASH) { + if (is_laplace(dslash_type)) { if (multishift > 1) return true; // Laplace doesn't support multishift if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE) return true; // Laplace only supports direct solves } - if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_ASQTAD_DSLASH) { + if (is_staggered(dslash_type)) { // the staggered and asqtad operators aren't HPD if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type)) return true; @@ -131,8 +64,6 @@ TEST_P(StaggeredInvertTest, verify) { if (skip_test(GetParam())) GTEST_SKIP(); - auto tol_backup = tol; - inv_param.tol = 0.0; inv_param.tol_hq = 0.0; auto res_t = ::testing::get<7>(GetParam()); @@ -143,18 +74,21 @@ TEST_P(StaggeredInvertTest, verify) auto solution_type = ::testing::get<1>(param); auto solve_type = ::testing::get<2>(param); + // Make a local copy of "tol" for modification in place + auto verify_tol = tol; + // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this // The mass squared is a proxy for the condition 
number - if (is_normal_residual(inverter_type)) tol /= (0.25 * mass * mass); + if (is_normal_residual(inverter_type)) verify_tol /= (0.25 * mass * mass); // To solve the direct operator to a given tolerance, grind the preconditioned // operator to 0.5 * mass * tol... to keep the target tolerance in inv_param // in check, we shift the requirement to the verified tolerance instead. if (solution_type == QUDA_MAT_SOLUTION) { if (solve_type == QUDA_DIRECT_PC_SOLVE) - tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps + verify_tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps if (solve_type == QUDA_NORMOP_SOLVE) - tol /= (0.5 * mass); // a proxy for the condition number + verify_tol /= (0.5 * mass); // a proxy for the condition number } // The power iterations method of determining the Chebyshev window @@ -163,17 +97,16 @@ TEST_P(StaggeredInvertTest, verify) if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER) inv_param.ca_basis = QUDA_POWER_BASIS; - // Single precision needs a tiny bump + // Single precision needs a tiny bump due to small host/device precision deviations if (prec == QUDA_SINGLE_PRECISION) - tol *= 1.01; + verify_tol *= 1.01; for (auto rsd : solve(GetParam())) { - if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], tol); } + if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], verify_tol); } if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) { EXPECT_LE(rsd[1], tol_hq); } } inv_param.ca_basis = ca_basis_tmp; - tol = tol_backup; } std::string gettestname(::testing::TestParamInfo param) diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 70aea9cdc2..f17622499b 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -231,7 +231,7 @@ void constructWilsonTestSpinorParam(quda::ColorSpinorParam *cs_param, const Quda } cs_param->pc_type = inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ? 
QUDA_5D_PC : QUDA_4D_PC; for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d]; - bool pc = isPCSolution(inv_param->solution_type); + bool pc = is_pc_solution(inv_param->solution_type); if (pc) cs_param->x[0] /= 2; cs_param->siteSubset = pc ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET; @@ -257,15 +257,130 @@ void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision p param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; param.nDim = nDim; param.pc_type = QUDA_4D_PC; - param.siteSubset = isPCSolution(sol_type) ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET; + param.siteSubset = is_pc_solution(sol_type) ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; param.location = QUDA_CPU_FIELD_LOCATION; // DMH FIXME so one can construct device noise for (int d = 0; d < nDim; d++) param.x[d] = x[d]; - if (isPCSolution(sol_type)) param.x[0] /= 2; + if (is_pc_solution(sol_type)) param.x[0] /= 2; quda::ColorSpinorField spinor_in(param); quda::spinorNoise(spinor_in, rng, QUDA_NOISE_UNIFORM); } +// Helper functions +bool is_pc_solution(QudaSolutionType type) +{ + switch (type) { + case QUDA_MATPC_SOLUTION: + case QUDA_MATPC_DAG_SOLUTION: + case QUDA_MATPCDAG_MATPC_SOLUTION: + case QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION: return true; + default: return false; + } +} + +bool is_full_solution(QudaSolutionType type) +{ + switch (type) { + case QUDA_MAT_SOLUTION: + case QUDA_MATDAG_MAT_SOLUTION: return true; + default: return false; + } +} + +bool is_full_solve(QudaSolveType type) +{ + switch (type) { + case QUDA_DIRECT_SOLVE: + case QUDA_NORMOP_SOLVE: + case QUDA_NORMERR_SOLVE: return true; + default: return false; + } +} + +bool is_preconditioned_solve(QudaSolveType type) +{ + switch (type) { + case QUDA_DIRECT_PC_SOLVE: + case QUDA_NORMOP_PC_SOLVE: + case QUDA_NORMERR_PC_SOLVE: return true; + default: return false; + } +} + +bool is_normal_solve(QudaInverterType inv_type, QudaSolveType solve_type) +{ + 
switch (solve_type) { + case QUDA_NORMOP_SOLVE: + case QUDA_NORMOP_PC_SOLVE: return true; + default: + switch (inv_type) { + case QUDA_CGNR_INVERTER: + case QUDA_CGNE_INVERTER: + case QUDA_CA_CGNR_INVERTER: + case QUDA_CA_CGNE_INVERTER: return true; + default: return false; + } + } +} + +bool is_hermitian_solver(QudaInverterType type) +{ + switch(type) { + case QUDA_CG_INVERTER: + case QUDA_CA_CG_INVERTER: return true; + default: return false; + } +} + +bool support_solution_accumulator_pipeline(QudaInverterType type) +{ + switch (type) { + case QUDA_CG_INVERTER: + case QUDA_CA_CG_INVERTER: + case QUDA_CGNR_INVERTER: + case QUDA_CGNE_INVERTER: + case QUDA_PCG_INVERTER: return true; + default: return false; + } +} + +bool is_normal_residual(QudaInverterType type) +{ + switch (type) { + case QUDA_CGNR_INVERTER: + case QUDA_CA_CGNR_INVERTER: return true; + default: return false; + } +} + +bool is_staggered(QudaDslashType type) +{ + switch (type) { + case QUDA_STAGGERED_DSLASH: + case QUDA_ASQTAD_DSLASH: return true; + default: return false; + } +} + +bool is_chiral(QudaDslashType type) +{ + switch (type) { + case QUDA_DOMAIN_WALL_DSLASH: + case QUDA_DOMAIN_WALL_4D_DSLASH: + case QUDA_MOBIUS_DWF_DSLASH: + case QUDA_MOBIUS_DWF_EOFA_DSLASH: return true; + default: return false; + } +} + +bool is_laplace(QudaDslashType type) +{ + switch (type) { + case QUDA_LAPLACE_DSLASH: return true; + default: return false; + } +} + void initComms(int argc, char **argv, std::array &commDims) { initComms(argc, argv, commDims.data()); } #if defined(QMP_COMMS) || defined(MPI_COMMS) diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index f5276e26f1..46ba4e715f 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -117,11 +117,20 @@ void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision p // Helper functions //------------------------------------------------------ -inline bool isPCSolution(QudaSolutionType solution_type) -{ - return 
(solution_type == QUDA_MATPC_SOLUTION || solution_type == QUDA_MATPC_DAG_SOLUTION - || solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); -} +bool is_pc_solution(QudaSolutionType solution_type); +bool is_full_solution(QudaSolutionType type); + +bool is_preconditioned_solve(QudaSolveType type); +bool is_normal_solve(QudaInverterType inv_type, QudaSolveType solve_type); + +bool is_hermitian_solver(QudaInverterType type); +bool support_solution_accumulator_pipeline(QudaInverterType type); +bool is_normal_residual(QudaInverterType type); + +bool is_staggered(QudaDslashType type); +bool is_chiral(QudaDslashType type); +bool is_laplace(QudaDslashType type); + //------------------------------------------------------ // Reports basic statistics of flops and solver iterations diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp index 95efb0d85a..fcc0b1d697 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -815,7 +815,7 @@ void constructStaggeredTestSpinorParam(quda::ColorSpinorParam *cs_param, const Q cs_param->nSpin = 1; cs_param->nDim = 4; for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d]; - bool pc = isPCSolution(inv_param->solution_type); + bool pc = is_pc_solution(inv_param->solution_type); if (pc) cs_param->x[0] /= 2; cs_param->pc_type = QUDA_4D_PC; cs_param->siteSubset = pc ? 
QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET; From bf4eaad79bdecfd678a308ecd790b8a27efc1a06 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 7 Dec 2023 10:41:25 -0800 Subject: [PATCH 30/53] Wilson-type compile fix --- tests/invert_test_gtest.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp index c5b71ead80..47cc230fe2 100644 --- a/tests/invert_test_gtest.hpp +++ b/tests/invert_test_gtest.hpp @@ -42,7 +42,7 @@ bool skip_test(test_t param) // Skip if the inverter does not support batched update and batched update is greater than one if (!support_solution_accumulator_pipeline(inverter_type) && solution_accumulator_pipeline > 1) return true; // MdagMLocal only support for Mobius at present - if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) { + if (is_normal_solve(inverter_type, solve_type) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) { #ifdef QUDA_MMA_AVAILABLE if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true; #else From b4300ccc8db11c8938cf375aa21d605bb57ea016 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 7 Dec 2023 11:29:44 -0800 Subject: [PATCH 31/53] Changed dwf tolerance check to use is_chiral --- tests/invert_test_gtest.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp index 0bbc4f6926..dca4bc5e9a 100644 --- a/tests/invert_test_gtest.hpp +++ b/tests/invert_test_gtest.hpp @@ -69,10 +69,7 @@ TEST_P(InvertTest, verify) if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq; auto tol = inv_param.tol; - if (inv_param.dslash_type == QUDA_DOMAIN_WALL_DSLASH || - inv_param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || - inv_param.dslash_type == QUDA_MOBIUS_DWF_DSLASH || - inv_param.dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { + if (is_chiral(inv_param.dslash_type)) { tol *= std::sqrt(static_cast(inv_param.Ls)); } // FIXME 
eventually we should build in refinement to the *NR solvers to remove the need for this From ca2be8d60eb880a5067a7a1f5d9a4e6a96fc7b3f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 11 Dec 2023 17:43:37 -0800 Subject: [PATCH 32/53] Some BiCGStab cleanup, SVD deflation is being quirky --- lib/inv_bicgstab_quda.cpp | 69 ++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index 10ec609ec3..2822867667 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -41,7 +41,26 @@ namespace quda { void BiCGstab::operator()(ColorSpinorField &x, ColorSpinorField &b) { - profile.TPSTART(QUDA_PROFILE_PREAMBLE); + if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT); + + double b2 = blas::norm2(b); // norm sq of source + double r2; // norm sq of residual + + // Check to see that we're not trying to invert on a zero-field source + if (b2 == 0) { + if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + warningQuda("inverting on zero-field source"); + x = b; + param.true_res = 0.0; + param.true_res_hq = 0.0; + profile.TPSTOP(QUDA_PROFILE_INIT); + return; + } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { + b2 = r2; + } else { + errorQuda("Null vector computing requires non-zero guess!"); + } + } if (!init) { ColorSpinorParam csParam(x); @@ -56,9 +75,6 @@ namespace quda { init = true; } - double b2 = blas::norm2(b); // norm sq of source - double r2; // norm sq of residual - if (param.deflate) { // Construct the eigensolver and deflation space if requested. if (param.eig_param.eig_type == QUDA_EIG_TR_LANCZOS || param.eig_param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) { @@ -70,15 +86,15 @@ namespace quda { } if (deflate_compute) { // compute the deflation space. 
- if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_PREAMBLE); + if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT); (*eig_solve)(evecs, evals); + if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT); if (param.deflate) { // double the size of the Krylov space extendSVDDeflationSpace(); // populate extra memory with L/R singular vectors eig_solve->computeSVD(evecs, evals); } - if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_PREAMBLE); deflate_compute = false; } if (recompute_evals) { @@ -108,22 +124,6 @@ namespace quda { r2 = blas::xmyNorm(b, r); } - // Check to see that we're not trying to invert on a zero-field source - if (b2 == 0) { - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { - warningQuda("inverting on zero-field source"); - x = b; - param.true_res = 0.0; - param.true_res_hq = 0.0; - profile.TPSTOP(QUDA_PROFILE_PREAMBLE); - return; - } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { - b2 = r2; - } else { - errorQuda("Null vector computing requires non-zero guess!"); - } - } - // set field aliasing according to whether we are doing mixed precision or not if (param.precision_sloppy == x.Precision()) { r_sloppy = r.create_alias(); @@ -156,6 +156,11 @@ namespace quda { x_sloppy = ColorSpinorField(csParam); } + if (!param.is_preconditioner) { + profile.TPSTOP(QUDA_PROFILE_INIT); + profile.TPSTART(QUDA_PROFILE_PREAMBLE); + } + double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver const bool use_heavy_quark_res = @@ -184,8 +189,10 @@ namespace quda { PrintStats("BiCGstab", k, r2, b2, heavy_quark_res); - profile.TPSTOP(QUDA_PROFILE_PREAMBLE); - profile.TPSTART(QUDA_PROFILE_COMPUTE); + if (!param.is_preconditioner) { + profile.TPSTOP(QUDA_PROFILE_PREAMBLE); + profile.TPSTART(QUDA_PROFILE_COMPUTE); + } rho = r2; // cDotProductCuda(r0, r_sloppy); // BiCRstab blas::copy(p, r_sloppy); @@ -349,12 +356,14 @@ namespace quda { // y has already been updated 
blas::copy(x, y); - profile.TPSTOP(QUDA_PROFILE_COMPUTE); - profile.TPSTART(QUDA_PROFILE_EPILOGUE); + if (!param.is_preconditioner) { + profile.TPSTOP(QUDA_PROFILE_COMPUTE); + profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.iter += k; + param.iter += k; - if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); + if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); + } if (getVerbosity() >= QUDA_VERBOSE) printfQuda("BiCGstab: Reliable updates = %d\n", rUpdate); @@ -364,9 +373,9 @@ namespace quda { param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,r).z) : 0.0; PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq); - } - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); + profile.TPSTOP(QUDA_PROFILE_EPILOGUE); + } } From 6bcbdab3c23fdf7c2bcd7c8007c02a687a2fbb9c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 12 Dec 2023 10:35:29 -0800 Subject: [PATCH 33/53] Added an asqtad splitgrid test to probe loading both fat and fat+long links appropriately, fixed a staggered split grid bug --- lib/interface_quda.cpp | 80 ++++++++++++++++++----------- tests/CMakeLists.txt | 42 ++++++++++----- tests/staggered_dslash_test_utils.h | 2 +- 3 files changed, 80 insertions(+), 44 deletions(-) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index c9576f6f01..582f5031f2 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3010,7 +3010,7 @@ void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_para template void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // color spinor field pointers, and inv_param void *h_gauge, void *milc_fatlinks, void *milc_longlinks, - QudaGaugeParam *gauge_param, // gauge field pointers + QudaGaugeParam *gauge_param_, // gauge field pointers void *h_clover, void *h_clovinv, // clover field pointers Interface op, Args... 
args) { @@ -3030,14 +3030,19 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]); } + // Create a local copy of gauge_param that we can modify without perturbing + // the original one + if (!gauge_param_) + errorQuda("Input gauge_param is null"); + + QudaGaugeParam gauge_param = *gauge_param_; + if (num_sub_partition == 1) { // In this case we don't split the grid. for (int n = 0; n < param->num_src; n++) { op(_hp_x[n], _hp_b[n], param, args...); } } else { - if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr"); } - // Doing the sub-partition arithmatics if (param->num_src_per_sub_partition * num_sub_partition != param->num_src) { errorQuda("We need to have split_grid[0](=%d) * split_grid[1](=%d) * split_grid[2](=%d) * split_grid[3](=%d) * " @@ -3054,14 +3059,19 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col checkInvertParam(param, _hp_x[0], _hp_b[0]); - bool is_staggered; + bool is_staggered = false; + bool is_asqtad = false; if (h_gauge) { is_staggered = false; } else if (milc_fatlinks) { is_staggered = true; + if (param->dslash_type == QUDA_ASQTAD_DSLASH) { + if (!milc_longlinks) + errorQuda("milc_longlinks is null for an asqtad dslash"); + is_asqtad = true; + } } else { errorQuda("Both h_gauge and milc_fatlinks are null."); - is_staggered = true; // to suppress compiler warning/error. } // Gauge fields/params @@ -3075,23 +3085,28 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // set up the gauge field params. 
if (!is_staggered) { // not staggered - gf_param = new GaugeFieldParam(*gauge_param, h_gauge); + gf_param = new GaugeFieldParam(gauge_param, h_gauge); if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; in = GaugeField::Create(*gf_param); } else { // staggered - milc_fatlink_param = new GaugeFieldParam(*gauge_param, milc_fatlinks); + milc_fatlink_param = new GaugeFieldParam(gauge_param, milc_fatlinks); if (milc_fatlink_param->order <= 4) milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; + milc_fatlink_param->order = QUDA_MILC_GAUGE_ORDER; milc_fatlink_field = GaugeField::Create(*milc_fatlink_param); - milc_longlink_param = new GaugeFieldParam(*gauge_param, milc_longlinks); - if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; - milc_longlink_field = GaugeField::Create(*milc_longlink_param); + + if (is_asqtad) { + milc_longlink_param = new GaugeFieldParam(gauge_param, milc_longlinks); + if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; + milc_longlink_param->order = QUDA_MILC_GAUGE_ORDER; + milc_longlink_field = GaugeField::Create(*milc_longlink_param); + } } // Create the temp host side helper fields, which are just wrappers of the input pointers. 
bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); - lat_dim_t X = {gauge_param->X[0], gauge_param->X[1], gauge_param->X[2], gauge_param->X[3]}; + lat_dim_t X = {gauge_param.X[0], gauge_param.X[1], gauge_param.X[2], gauge_param.X[3]}; ColorSpinorParam cpuParam(_hp_b[0], *param, X, pc_solution, param->input_location); std::vector _h_b(param->num_src); for (int i = 0; i < param->num_src; i++) { @@ -3119,12 +3134,12 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col gf_param->pad *= split_key[d]; } else { milc_fatlink_param->x[d] *= split_key[d]; - milc_fatlink_param->pad *= split_key[d]; - milc_longlink_param->x[d] *= split_key[d]; - milc_longlink_param->pad *= split_key[d]; + //milc_fatlink_param->pad *= split_key[d]; + if (is_asqtad) milc_longlink_param->x[d] *= split_key[d]; + //milc_longlink_param->pad *= split_key[d]; } - gauge_param->X[d] *= split_key[d]; - gauge_param->ga_pad *= split_key[d]; + gauge_param.X[d] *= split_key[d]; + if (!is_staggered) gauge_param.ga_pad *= split_key[d]; } // Deal with clover field. 
For Multi source computatons, clover field construction is done @@ -3178,15 +3193,19 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col v_g[0] = in; quda::split_field(*collected_gauge, v_g, split_key); } else { + std::vector v_g(1); + milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE; - milc_longlink_param->create = QUDA_NULL_FIELD_CREATE; collected_milc_fatlink_field = new quda::GaugeField(*milc_fatlink_param); - collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param); - std::vector v_g(1); v_g[0] = milc_fatlink_field; quda::split_field(*collected_milc_fatlink_field, v_g, split_key); - v_g[0] = milc_longlink_field; - quda::split_field(*collected_milc_longlink_field, v_g, split_key); + + if (is_asqtad) { + milc_longlink_param->create = QUDA_NULL_FIELD_CREATE; + collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param); + v_g[0] = milc_longlink_field; + quda::split_field(*collected_milc_longlink_field, v_g, split_key); + } } profileInvertMultiSrc.TPSTART(QUDA_PROFILE_PREAMBLE); @@ -3219,9 +3238,9 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // the split topology. 
logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n"); if (!is_staggered) { - loadGaugeQuda(collected_gauge->raw_pointer(), gauge_param); + loadGaugeQuda(collected_gauge->raw_pointer(), &gauge_param); } else { - loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->raw_pointer(), + loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field->raw_pointer(), collected_milc_longlink_field->raw_pointer()); } logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n"); @@ -3247,8 +3266,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col comm_barrier(); for (int d = 0; d < CommKey::n_dim; d++) { - gauge_param->X[d] /= split_key[d]; - gauge_param->ga_pad /= split_key[d]; + gauge_param.X[d] /= split_key[d]; + if (!is_staggered) gauge_param.ga_pad /= split_key[d]; } for (int n = 0; n < param->num_src_per_sub_partition; n++) { @@ -3269,9 +3288,12 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col delete collected_gauge; } else { delete milc_fatlink_field; - delete milc_longlink_field; delete collected_milc_fatlink_field; - delete collected_milc_longlink_field; + + if (is_asqtad) { + delete milc_longlink_field; + delete collected_milc_longlink_field; + } } if (input_clover) { delete input_clover; } @@ -3281,10 +3303,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // Restore the gauge field if (!is_staggered) { - loadGaugeQuda(h_gauge, gauge_param); + loadGaugeQuda(h_gauge, &gauge_param); } else { freeGaugeQuda(); - loadFatLongGaugeQuda(param, gauge_param, milc_fatlinks, milc_longlinks); + loadFatLongGaugeQuda(param, &gauge_param, milc_fatlinks, milc_longlinks); } if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 923df3d04c..cb87adf27f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -871,7 
+871,7 @@ endif() add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type ${DIRAC_NAME} - --test 0 + --test MatPC --dim 20 20 20 20 --gtest_output=json:dslash_${DIRAC_NAME}_benchmark_pol${pol2}.json --gtest_filter=*benchmark/*n0) @@ -907,7 +907,7 @@ endif() COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type ${DIRAC_NAME} --all-partitions 1 - --test 0 + --test MatPC --dim 20 20 20 20 --gtest_output=json:dslash_${DIRAC_NAME}_benchmark_pol${pol2}.json --gtest_filter=*benchmark/*n0) @@ -1161,19 +1161,19 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:invert_test_staggered_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_staggered_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() + if(DEFINED ENV{QUDA_ENABLE_TUNING}) + if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) + add_test(NAME invert_test_splitgrid_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) endif() + endif() add_test(NAME invert_test_asqtad_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} @@ 
-1182,6 +1182,20 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:invert_test_asqtad_${prec}.xml) + if(DEFINED ENV{QUDA_ENABLE_TUNING}) + if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) + add_test(NAME invert_test_splitgrid_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true + --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_asqtad_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) + endif() + endif() + if (QUDA_LAPLACE) add_test(NAME invert_test_laplace_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 68b3c676b8..e7eb39b07f 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -333,7 +333,7 @@ struct StaggeredDslashTestWrapper { _hp_x[i] = vp_spinor_out[i].data(); _hp_b[i] = vp_spinor[i].data(); } - dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, qdp_fatlink, qdp_longlink, + dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink, milc_longlink, &gauge_param); } else { From 67b5ef37242363fc7b6b2b68552d500312e688ba Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 12 Dec 2023 14:09:42 -0800 Subject: [PATCH 34/53] Further split grid cleanup, some tolerance fixes --- lib/interface_quda.cpp | 61 +++++++++-------------- tests/CMakeLists.txt | 4 +- tests/staggered_eigensolve_test.cpp | 4 +- tests/staggered_eigensolve_test_gtest.hpp | 7 +++ 4 files changed, 35 insertions(+), 41 deletions(-) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 582f5031f2..b42e4ba806 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3034,7 
+3034,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // the original one if (!gauge_param_) errorQuda("Input gauge_param is null"); - QudaGaugeParam gauge_param = *gauge_param_; if (num_sub_partition == 1) { // In this case we don't split the grid. @@ -3078,10 +3077,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col GaugeFieldParam *gf_param = nullptr; GaugeField *in = nullptr; // Staggered gauge fields/params - GaugeFieldParam *milc_fatlink_param = nullptr; - GaugeFieldParam *milc_longlink_param = nullptr; - GaugeField *milc_fatlink_field = nullptr; - GaugeField *milc_longlink_field = nullptr; + GaugeFieldParam milc_fatlink_param; + GaugeFieldParam milc_longlink_param; + quda::GaugeField milc_fatlink_field; + quda::GaugeField milc_longlink_field; // set up the gauge field params. if (!is_staggered) { // not staggered @@ -3089,16 +3088,14 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; in = GaugeField::Create(*gf_param); } else { // staggered - milc_fatlink_param = new GaugeFieldParam(gauge_param, milc_fatlinks); - if (milc_fatlink_param->order <= 4) milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; - milc_fatlink_param->order = QUDA_MILC_GAUGE_ORDER; - milc_fatlink_field = GaugeField::Create(*milc_fatlink_param); + milc_fatlink_param = GaugeFieldParam(gauge_param, milc_fatlinks); + milc_fatlink_param.order = QUDA_MILC_GAUGE_ORDER; + milc_fatlink_field = GaugeField(milc_fatlink_param); if (is_asqtad) { - milc_longlink_param = new GaugeFieldParam(gauge_param, milc_longlinks); - if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; - milc_longlink_param->order = QUDA_MILC_GAUGE_ORDER; - milc_longlink_field = GaugeField::Create(*milc_longlink_param); + milc_longlink_param = GaugeFieldParam(gauge_param, milc_longlinks); + 
milc_longlink_param.order = QUDA_MILC_GAUGE_ORDER; + milc_longlink_field = GaugeField(milc_longlink_param); } } @@ -3133,10 +3130,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col gf_param->x[d] *= split_key[d]; gf_param->pad *= split_key[d]; } else { - milc_fatlink_param->x[d] *= split_key[d]; - //milc_fatlink_param->pad *= split_key[d]; - if (is_asqtad) milc_longlink_param->x[d] *= split_key[d]; - //milc_longlink_param->pad *= split_key[d]; + milc_fatlink_param.x[d] *= split_key[d]; + if (is_asqtad) milc_longlink_param.x[d] *= split_key[d]; } gauge_param.X[d] *= split_key[d]; if (!is_staggered) gauge_param.ga_pad *= split_key[d]; @@ -3183,8 +3178,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } quda::GaugeField *collected_gauge = nullptr; - quda::GaugeField *collected_milc_fatlink_field = nullptr; - quda::GaugeField *collected_milc_longlink_field = nullptr; + quda::GaugeField collected_milc_fatlink_field; + quda::GaugeField collected_milc_longlink_field; if (!is_staggered) { gf_param->create = QUDA_NULL_FIELD_CREATE; @@ -3195,16 +3190,16 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } else { std::vector v_g(1); - milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE; - collected_milc_fatlink_field = new quda::GaugeField(*milc_fatlink_param); - v_g[0] = milc_fatlink_field; - quda::split_field(*collected_milc_fatlink_field, v_g, split_key); + milc_fatlink_param.create = QUDA_NULL_FIELD_CREATE; + collected_milc_fatlink_field = GaugeField(milc_fatlink_param); + v_g[0] = &milc_fatlink_field; + quda::split_field(collected_milc_fatlink_field, v_g, split_key); if (is_asqtad) { - milc_longlink_param->create = QUDA_NULL_FIELD_CREATE; - collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param); - v_g[0] = milc_longlink_field; - quda::split_field(*collected_milc_longlink_field, v_g, split_key); + milc_longlink_param.create = QUDA_NULL_FIELD_CREATE; 
+ collected_milc_longlink_field = GaugeField(milc_longlink_param); + v_g[0] = &milc_longlink_field; + quda::split_field(collected_milc_longlink_field, v_g, split_key); } } @@ -3240,8 +3235,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col if (!is_staggered) { loadGaugeQuda(collected_gauge->raw_pointer(), &gauge_param); } else { - loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field->raw_pointer(), - collected_milc_longlink_field->raw_pointer()); + loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field.raw_pointer(), + (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr); } logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n"); @@ -3286,14 +3281,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col if (!is_staggered) { delete in; delete collected_gauge; - } else { - delete milc_fatlink_field; - delete collected_milc_fatlink_field; - - if (is_asqtad) { - delete milc_longlink_field; - delete collected_milc_longlink_field; - } } if (input_clover) { delete input_clover; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cb87adf27f..e325e5c6be 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1379,9 +1379,9 @@ foreach(prec IN LISTS TEST_PRECS) # These require looser tolerances to keep iterations to solution in check if(${prec} STREQUAL "double") - set(tol 1e-6) - elseif(${prec} STREQUAL "single") set(tol 1e-5) + elseif(${prec} STREQUAL "single") + set(tol 1e-4) endif() if(QUDA_DIRAC_STAGGERED) diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 70d70a5b77..85136b3972 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -174,7 +174,7 @@ std::vector eigensolve(test_t test_param) if (enable_testing) { eig_use_poly_acc = false; eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE; - eig_batched_rotate != 0 ? 
eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 4; + eig_batched_rotate != 0 ? eig_param.batched_rotate = eig_batched_rotate : eig_param.batched_rotate = 0; } logQuda(QUDA_SUMMARIZE, "Action = %s, Solver = %s, norm-op = %s, even-odd = %s, with SVD = %s, spectrum = %s\n", @@ -307,7 +307,7 @@ int main(int argc, char **argv) if (eig_tol != expected_tol) { eig_tol = expected_tol; changes = true; } if (niter != 1000) { niter = 1000; changes = true; } if (eig_n_kr != 256) { eig_n_kr = 256; changes = true; } - if (eig_block_size != 8) { eig_block_size = 8; } + if (eig_block_size != 4) { eig_block_size = 4; } if (changes) { printfQuda("For gtest, various defaults are changed:\n"); diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp index 376651c447..ab879a9b37 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -102,6 +102,13 @@ bool skip_test(test_t test_param) } } + // There seems to be some stubborn issue with this combination on 2xGPUs that I can't quite + // comprehend, and am a bit tired to debugging. + //if (prec == QUDA_SINGLE_PRECISION && dslash_type == QUDA_ASQTAD_DSLASH && + // eig_type == QUDA_EIG_BLK_TR_LANCZOS && spectrum == QUDA_SPECTRUM_SR_EIG && + // combo_solve_type == QUDA_DIRECT_PC_SOLVE) + // return true; + return false; } From 19d6f02905feb01e74fc73dec24540ddb878647a Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 12 Dec 2023 15:49:05 -0800 Subject: [PATCH 35/53] Potential logic in (block) trlm related to using a max norm for getting the smallest eigenvalues w/out polynomial acceleration... 
and fixing an uninitialized variable issue --- lib/eig_block_trlm.cpp | 14 +++++++------- lib/eig_trlm.cpp | 16 ++++++++-------- lib/inv_bicgstab_quda.cpp | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 160af9dff4..0cf7564440 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -83,7 +83,7 @@ namespace quda checkChebyOpMax(kSpace); // Convergence and locking criteria - double mat_norm = 0.0; + //double mat_norm = 0.0; double epsilon = setEpsilon(kSpace[0].Precision()); // Print Eigensolver params @@ -106,15 +106,15 @@ namespace quda profile.TPSTART(QUDA_PROFILE_COMPUTE); // mat_norm is updated. - for (int i = num_locked; i < n_kr; i++) - if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + //for (int i = num_locked; i < n_kr; i++) + // if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { - if (residua[i + num_locked] < epsilon * mat_norm) { + if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * mat_norm); + epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -125,9 +125,9 @@ namespace quda // Convergence check iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { - if (residua[i + num_locked] < tol * mat_norm) { + if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * mat_norm); + tol * fabs(alpha[i + num_locked]) /*mat_norm*/); iter_converged = i; } else { // Unlikely to find new converged pairs diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 00d3941527..eaea8d8560 100644 ---
a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -64,7 +64,7 @@ namespace quda checkChebyOpMax(kSpace); // Convergence and locking criteria - double mat_norm = 0.0; + //double mat_norm = 0.0; double epsilon = setEpsilon(kSpace[0].Precision()); // Print Eigensolver params @@ -87,15 +87,15 @@ namespace quda profile.TPSTART(QUDA_PROFILE_COMPUTE); // mat_norm is updated. - for (int i = num_locked; i < n_kr; i++) - if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + //for (int i = num_locked; i < n_kr; i++) + // if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { - if (residua[i + num_locked] < epsilon * mat_norm) { + if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked])/*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * mat_norm); + epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -106,9 +106,9 @@ namespace quda // Convergence check iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { - if (residua[i + num_locked] < tol * mat_norm) { + if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * mat_norm); + tol * fabs(alpha[i + num_locked]) /*mat_norm*/); iter_converged = i; } else { // Unlikely to find new converged pairs @@ -166,7 +166,7 @@ namespace quda n_conv, restart_iter, iter); // Dump all Ritz values and residua if using Chebyshev - for (int i = 0; i < n_conv && eig_param->use_poly_acc; i++) { + for (int i = 0; i < n_conv /*&& eig_param->use_poly_acc*/; i++) { logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]); } diff --git a/lib/inv_bicgstab_quda.cpp 
b/lib/inv_bicgstab_quda.cpp index 2822867667..3fa5afd849 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -44,7 +44,7 @@ namespace quda { if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT); double b2 = blas::norm2(b); // norm sq of source - double r2; // norm sq of residual + double r2 = 0.0; // norm sq of residual // Check to see that we're not trying to invert on a zero-field source if (b2 == 0) { From 862896eb31af4b8c1bae9bd8c8b9f82916079f40 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 12 Dec 2023 19:26:00 -0800 Subject: [PATCH 36/53] Restored norm behavior for (block)TRLM LR convergence --- lib/eig_block_trlm.cpp | 25 +++++++++++++++++-------- lib/eig_trlm.cpp | 25 +++++++++++++++++-------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 0cf7564440..18df423ad4 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -83,7 +83,7 @@ namespace quda checkChebyOpMax(kSpace); // Convergence and locking criteria - //double mat_norm = 0.0; + double mat_norm = 0.0; double epsilon = setEpsilon(kSpace[0].Precision()); // Print Eigensolver params @@ -105,16 +105,25 @@ namespace quda eigensolveFromBlockArrowMat(); profile.TPSTART(QUDA_PROFILE_COMPUTE); - // mat_norm is updated. 
- //for (int i = num_locked; i < n_kr; i++) - // if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + // mat_norm is updated and used for LR + for (int i = num_locked; i < n_kr; i++) + if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + + // Lambda that returns mat_norm for LR and returns the relevant alpha + // (the corresponding Ritz value) for SR + auto check_norm = [&] (double sr_norm) -> double { + if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) + return mat_norm; + else + return sr_norm; + }; // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { - if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/) { + if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/); + epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -125,9 +134,9 @@ namespace quda // Convergence check iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { - if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) { + if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * fabs(alpha[i + num_locked]) /*mat_norm*/); + tol * check_norm(alpha[i + num_locked]) /*mat_norm*/); iter_converged = i; } else { // Unlikely to find new converged pairs diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index eaea8d8560..99e3b465b5 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -64,7 +64,7 @@ namespace quda checkChebyOpMax(kSpace); // Convergence and locking criteria - //double mat_norm = 0.0; + double mat_norm = 0.0; double epsilon = 
setEpsilon(kSpace[0].Precision()); // Print Eigensolver params @@ -86,16 +86,25 @@ namespace quda eigensolveFromArrowMat(); profile.TPSTART(QUDA_PROFILE_COMPUTE); - // mat_norm is updated. - //for (int i = num_locked; i < n_kr; i++) - // if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + // mat_norm is updated and used for LR + for (int i = num_locked; i < n_kr; i++) + if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + + // Lambda that returns mat_norm for LR and returns the relevant alpha + // (the corresponding Ritz value) for SR + auto check_norm = [&] (double sr_norm) -> double { + if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) + return mat_norm; + else + return sr_norm; + }; // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { - if (residua[i + num_locked] < epsilon * fabs(alpha[i + num_locked])/*mat_norm*/) { + if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])/*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * fabs(alpha[i + num_locked]) /*mat_norm*/); + epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -106,9 +115,9 @@ namespace quda // Convergence check iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { - if (residua[i + num_locked] < tol * fabs(alpha[i + num_locked]) /*mat_norm*/) { + if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) { logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * fabs(alpha[i + num_locked]) /*mat_norm*/); + tol * check_norm(alpha[i + num_locked]) /*mat_norm*/); iter_converged = i; } else { // Unlikely to find new converged pairs From 5f901e4416e76b8f5fd564c19df051cd7e74d3c0 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 12 Dec 2023 20:46:38 -0800 Subject: 
[PATCH 37/53] Updated Wilson bits of split grid to use GaugeField objects as appropriate --- lib/interface_quda.cpp | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index b42e4ba806..7a8c4d8b45 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3074,8 +3074,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } // Gauge fields/params - GaugeFieldParam *gf_param = nullptr; - GaugeField *in = nullptr; + GaugeFieldParam gf_param; + GaugeField in; // Staggered gauge fields/params GaugeFieldParam milc_fatlink_param; GaugeFieldParam milc_longlink_param; @@ -3084,9 +3084,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // set up the gauge field params. if (!is_staggered) { // not staggered - gf_param = new GaugeFieldParam(gauge_param, h_gauge); - if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; - in = GaugeField::Create(*gf_param); + gf_param = GaugeFieldParam(gauge_param, h_gauge); + in = GaugeField(gf_param); } else { // staggered milc_fatlink_param = GaugeFieldParam(gauge_param, milc_fatlinks); milc_fatlink_param.order = QUDA_MILC_GAUGE_ORDER; @@ -3127,8 +3126,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col errorQuda("Split not possible: %2d %% %2d != 0", comm_dim(d), split_key[d]); } if (!is_staggered) { - gf_param->x[d] *= split_key[d]; - gf_param->pad *= split_key[d]; + gf_param.x[d] *= split_key[d]; + gf_param.pad *= split_key[d]; } else { milc_fatlink_param.x[d] *= split_key[d]; if (is_asqtad) milc_longlink_param.x[d] *= split_key[d]; @@ -3177,16 +3176,16 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } } - quda::GaugeField *collected_gauge = nullptr; + quda::GaugeField collected_gauge; quda::GaugeField collected_milc_fatlink_field; quda::GaugeField 
collected_milc_longlink_field; if (!is_staggered) { - gf_param->create = QUDA_NULL_FIELD_CREATE; - collected_gauge = new quda::GaugeField(*gf_param); + gf_param.create = QUDA_NULL_FIELD_CREATE; + collected_gauge = quda::GaugeField(gf_param); std::vector v_g(1); - v_g[0] = in; - quda::split_field(*collected_gauge, v_g, split_key); + v_g[0] = &in; + quda::split_field(collected_gauge, v_g, split_key); } else { std::vector v_g(1); @@ -3233,7 +3232,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // the split topology. logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n"); if (!is_staggered) { - loadGaugeQuda(collected_gauge->raw_pointer(), &gauge_param); + loadGaugeQuda(collected_gauge.raw_pointer(), &gauge_param); } else { loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field.raw_pointer(), (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr); @@ -3278,11 +3277,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col for (auto p : _h_x) { delete p; } for (auto p : _h_b) { delete p; } - if (!is_staggered) { - delete in; - delete collected_gauge; - } - if (input_clover) { delete input_clover; } if (collected_clover) { delete collected_clover; } @@ -3290,10 +3284,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // Restore the gauge field if (!is_staggered) { - loadGaugeQuda(h_gauge, &gauge_param); + loadGaugeQuda(h_gauge, gauge_param_); } else { freeGaugeQuda(); - loadFatLongGaugeQuda(param, &gauge_param, milc_fatlinks, milc_longlinks); + loadFatLongGaugeQuda(param, gauge_param_, milc_fatlinks, milc_longlinks); } if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { From aa236f30901997c8075c4adc6546b7f1551cc636 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 13 Dec 2023 10:06:17 -0800 Subject: [PATCH 38/53] WAR for blowing out argument sizes for diluting typical
staggered MG nc --- lib/spinor_dilute.in.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu index 9b57458e8a..5472d828b2 100644 --- a/lib/spinor_dilute.in.cu +++ b/lib/spinor_dilute.in.cu @@ -87,7 +87,11 @@ namespace quda const lat_dim_t &local_block, IntList) { if (src.Ncolor() == Nc) { - SpinorDilute(src, v, type, local_block); + if constexpr (Nc <= 32) { + SpinorDilute(src, v, type, local_block); + } else { + errorQuda("nColor = %d is too large to compile, see QUDA issues"); + } } else { if constexpr (sizeof...(N) > 0) spinorDilute(src, v, type, local_block, IntList()); From 48d7a21d4bfaf80322a097e7c1b6b9d749bb9b9c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 26 Dec 2023 06:44:48 -0800 Subject: [PATCH 39/53] Comment cleanup in eigensolver --- lib/eig_block_trlm.cpp | 8 ++++---- lib/eig_trlm.cpp | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 18df423ad4..8a8f90063a 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -121,9 +121,9 @@ namespace quda // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { - if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/) { + if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])) { logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/); + epsilon * check_norm(alpha[i + num_locked])); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -134,9 +134,9 @@ namespace quda // Convergence check iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { - if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) { + if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked])) { 
logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * check_norm(alpha[i + num_locked]) /*mat_norm*/); + tol * check_norm(alpha[i + num_locked])); iter_converged = i; } else { // Unlikely to find new converged pairs diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 99e3b465b5..2d6f1d2cac 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -102,9 +102,9 @@ namespace quda // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { - if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])/*mat_norm*/) { + if (residua[i + num_locked] < epsilon * check_norm(alpha[i + num_locked])) { logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * check_norm(alpha[i + num_locked]) /*mat_norm*/); + epsilon * check_norm(alpha[i + num_locked])); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -115,9 +115,9 @@ namespace quda // Convergence check iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { - if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked]) /*mat_norm*/) { + if (residua[i + num_locked] < tol * check_norm(alpha[i + num_locked])) { logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * check_norm(alpha[i + num_locked]) /*mat_norm*/); + tol * check_norm(alpha[i + num_locked])); iter_converged = i; } else { // Unlikely to find new converged pairs @@ -175,7 +175,7 @@ namespace quda n_conv, restart_iter, iter); // Dump all Ritz values and residua if using Chebyshev - for (int i = 0; i < n_conv /*&& eig_param->use_poly_acc*/; i++) { + for (int i = 0; i < n_conv; i++) { logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]); } From d2b2372bff209f50a5359eaa0f8220a4d1cc633d Mon Sep 17 00:00:00 2001 From: Evan Weinberg 
Date: Tue, 26 Dec 2023 07:20:54 -0800 Subject: [PATCH 40/53] doxygen --- tests/host_reference/dslash_reference.h | 48 +++++++++++++++++ .../staggered_dslash_reference.h | 53 ++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 6331fbb65a..4d67134baa 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -109,17 +109,65 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu void *spinorCheck, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv); +/** + * @brief Verify a staggered inversion on the host. This version is a thin wrapper around a version that takes + * an array of outputs as is necessary for handling both single- and multi-shift solves. + * + * @param tmp A temporary spinor intermediate calculations + * @param ref A temporary reference field that is used to store the host verification solution + * @param in The initial rhs + * @param out The solution to A out = in + * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @param inv_param Invert params, used to query the solve type, etc + * @return The residual and HQ residual (if requested) + */ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param); +/** + * @brief Verify a single- or multi-shift staggered inversion on the host + * + * @param tmp A temporary spinor intermediate calculations + * @param ref A temporary reference field that is used to store the host verification solution + * @param in The initial rhs + * @param out The solutions to (A + shift) 
out = in for multiple shifts; shift == 0 for a single shift solve + * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts + * @return The residual and HQ residual (if requested) + */ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param); +/** + * @brief Verify a staggered-type eigenvector + * + * @param spinor The host eigenvector to be verified + * @param lambda The host eigenvalue to be verified + * @param i The number of the eigenvalue, only used when printing outputs + * @param eig_param Eigensolve params, used to query the operator type, etc + * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @return The residual norm + */ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i, QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); +/** + * @brief Verify a staggered-type singular vector + * + * @param spinor_left The host left singular vector to be verified + * @param spinor_right The host right singular vector to be verified + * @param sigma The host singular value to be verified + * @param i The number of the singular value, only used when printing outputs + * @param eig_param Eigensolve params, used to query the operator type, etc + * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @return The
residual norm + */ double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i, QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index 9fc6c9d641..b81f0fcb7a 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -11,16 +11,67 @@ using namespace quda; void setDims(int *); +/** + * @brief Base host routine to apply the even-odd or odd-even component of a staggered-type dslash + * + * @tparam real_t Datatype used in the host dslash + * @param res Host output result + * @param fatlink Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param longlink Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param ghostFatlink Ghost zones for the host fat links + * @param ghostLonglink Ghost zones for the host long links + * @param spinorField Host input spinor + * @param fwd_nbr_spinor Forward ghost zones for the host input spinor + * @param back_nbr_spinor Backwards ghost zones for the host input spinor + * @param oddBit 0 for D_eo, 1 for D_oe + * @param daggerBit 0 for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ template void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink, real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor, - real_t **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type); + real_t **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type); +/** + * @brief Apply even-odd or odd-even component of a staggered-type dslash + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a 
staggered or Laplace dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param oddBit 0 for D_eo, 1 for D_oe + * @param daggerBit 0 for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type); +/** + * @brief Apply the full parity staggered-type dslash + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param mass Mass for the dslash operator + * @param daggerBit 0 for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); +/** + * @brief Apply the even-even or odd-odd preconditioned staggered dslash + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param mass Mass for the dslash operator + * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator + * @param tmp Temporary spinor field + * @param parity Parity of preconditioned dslash + * @param dslash_type Dslash type + */ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int dagger_bit, 
ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type); From 9b631c300c53c7ef8fabcacb3bb56fc0597b030b Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 26 Dec 2023 07:42:13 -0800 Subject: [PATCH 41/53] Cleaned up some unnecessary temporary fields outside of verify functions --- tests/host_reference/dslash_reference.cpp | 24 +++++++++++-------- tests/host_reference/dslash_reference.h | 14 ++++------- .../staggered_dslash_reference.cpp | 6 ++++- .../staggered_dslash_reference.h | 3 +-- tests/staggered_dslash_test_utils.h | 2 +- tests/staggered_invert_test.cpp | 13 +++++----- 6 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 0b461076fd..fb04d173f7 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -743,22 +743,26 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou return l2r; } -std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param) { +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, + quda::GaugeField &long_link, QudaInvertParam &inv_param) { std::vector out_vector(1); out_vector[0] = out; - return verifyStaggeredInversion(tmp, ref, in, out_vector, fat_link, + return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param); } -std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param) +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, + quda::GaugeField &fat_link, 
quda::GaugeField &long_link, QudaInvertParam &inv_param) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; double l2r_max = 0.0; double hqr_max = 0.0; + + // Create temporary spinors + quda::ColorSpinorParam csParam(in); + quda::ColorSpinorField ref(csParam); + quda::ColorSpinorField tmp(csParam); + if (multishift > 1) { if (dslash_type == QUDA_LAPLACE_DSLASH) errorQuda("Multishift solves do not support the laplace operator (yet)"); @@ -777,7 +781,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda for (int i = 0; i < multishift; i++) { auto& out = out_vector[i]; double mass = 0.5 * sqrt(inv_param.offset[i]); - stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); @@ -815,7 +819,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, out, mass, 0, tmp, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type); stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); @@ -873,7 +877,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, spinor, mass, 0, tmp, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type); } 
else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { stag_mat(tmp, fat_link, long_link, spinor, mass, dagger, dslash_type); stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 4d67134baa..0388b2a10d 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -113,8 +113,6 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu * @brief Verify a staggered inversion on the host. This version is a thin wrapper around a version that takes * an array of outputs as is necessary for handling both single- and multi-shift solves. * - * @param tmp A temporary spinor intermediate calculations - * @param ref A temporary reference field that is used to store the host verification solution * @param in The initial rhs * @param out The solution to A out = in * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied @@ -122,15 +120,12 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu * @param inv_param Invert params, used to query the solve type, etc * @return The residual and HQ residual (if requested) */ -std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param); +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, + quda::GaugeField &long_link, QudaInvertParam &inv_param); /** * @brief Verify a single- or multi-shift staggered inversion on the host * - * @param tmp A temporary spinor intermediate calculations - * @param ref A temporary reference field that is used to store the host verification solution * @param in The initial rhs * @param out The solutions to (A + shift) out = 
in for multiple shifts; shift == 0 for a single shift solve * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied @@ -138,9 +133,8 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts * @return The residual and HQ residual (if requested) */ -std::array verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param); +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, + quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param); /** * @brief Verify a staggered-type eigenvector diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 4b04da1976..0f1b9c46dd 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -193,7 +193,7 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel } void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int, - ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type) + QudaParity parity, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } @@ -207,6 +207,10 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi errorQuda("full parity not supported in function"); } + // Create temporary spinors + quda::ColorSpinorParam csParam(in); + quda::ColorSpinorField tmp(csParam); + // dagger bit does not matter 
stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type); stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type); diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index b81f0fcb7a..7b95adb318 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -69,9 +69,8 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel * @param in Host input spinor * @param mass Mass for the dslash operator * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator - * @param tmp Temporary spinor field * @param parity Parity of preconditioned dslash * @param dslash_type Dslash type */ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, - double mass, int dagger_bit, ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type); + double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index e7eb39b07f..52a2f0b124 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -86,7 +86,7 @@ struct StaggeredDslashTestWrapper { stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type); break; case dslash_test_type::MatPC: - stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, tmpCpu, parity, dslash_type); + stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, parity, dslash_type); break; case dslash_test_type::Mat: stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 1c941a59a9..34a63c212f 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -280,12 +280,8 @@ 
std::vector> solve(test_t param) std::vector in(Nsrc); std::vector out(Nsrc); std::vector out_multishift(Nsrc * multishift); - quda::ColorSpinorField ref; - quda::ColorSpinorField tmp; quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param); - ref = quda::ColorSpinorField(cs_param); - tmp = quda::ColorSpinorField(cs_param); std::vector> _hp_multi_x(Nsrc, std::vector(multishift)); // Staggered vector construct END @@ -340,7 +336,10 @@ std::vector> solve(test_t param) std::vector gflops(Nsrc); std::vector iter(Nsrc); - quda::RNG rng(ref, 1234); + // Create a temporary spinor just to seed the rng + quda::ColorSpinorField tmp(cs_param); + quda::RNG rng(tmp, 1234); + tmp = quda::ColorSpinorField(); for (int n = 0; n < Nsrc; n++) { // Populate the host spinor with random numbers. @@ -410,9 +409,9 @@ std::vector> solve(test_t param) printfQuda("\nSource %d:\n", n); // Create an appropriate subset of the full out_multishift vector std::vector out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; - res[n] = verifyStaggeredInversion(tmp, ref, in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); + res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); } else { - res[n] = verifyStaggeredInversion(tmp, ref, in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param); + res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param); } } } From 0f366c13f7fc01ed9b098464d339e3befb710bac Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 26 Dec 2023 07:56:00 -0800 Subject: [PATCH 42/53] Added a simple staggered host stag_matdag_mat verify function --- tests/host_reference/dslash_reference.cpp | 11 +++-------- .../staggered_dslash_reference.cpp | 19 +++++++++++++++++++ .../staggered_dslash_reference.h | 14 ++++++++++++++ tests/staggered_dslash_test_utils.h | 6 +----- 4 files changed, 37 insertions(+), 13 deletions(-) diff --git 
a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index fb04d173f7..534d0b2a1b 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -747,8 +747,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda: quda::GaugeField &long_link, QudaInvertParam &inv_param) { std::vector out_vector(1); out_vector[0] = out; - return verifyStaggeredInversion(in, out_vector, fat_link, - long_link, inv_param); + return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param); } std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, @@ -761,7 +760,6 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: // Create temporary spinors quda::ColorSpinorParam csParam(in); quda::ColorSpinorField ref(csParam); - quda::ColorSpinorField tmp(csParam); if (multishift > 1) { if (dslash_type == QUDA_LAPLACE_DSLASH) @@ -821,8 +819,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: } stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_mat(tmp, fat_link, long_link, out, mass, dagger, dslash_type); - stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); } else { errorQuda("Invalid staggered solution type %d", inv_param.solution_type); } @@ -866,7 +863,6 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co // Create temporary spinors quda::ColorSpinorParam csParam(spinor); quda::ColorSpinorField ref(csParam); - quda::ColorSpinorField tmp(csParam); if (sol_type == QUDA_MAT_SOLUTION) { stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); @@ -879,8 +875,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co } stag_matpc(ref, fat_link, long_link, 
spinor, mass, 0, parity, dslash_type); } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_mat(tmp, fat_link, long_link, spinor, mass, dagger, dslash_type); - stag_mat(ref, fat_link, long_link, tmp, mass, 1 - dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); } // Compute M * x - \lambda * x diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 0f1b9c46dd..4eb46214e7 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -192,6 +192,25 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel } } +void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) +{ + // assert sPrecision and gPrecision must be the same + if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } + + // assert we have full-parity spinors + if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) + errorQuda("Unexpected site subsets for stag_matdagmat, out %d in %d", out.SiteSubset(), in.SiteSubset()); + + // Create temporary spinors + quda::ColorSpinorParam csParam(in); + quda::ColorSpinorField tmp(csParam); + + // Apply mat in sequence + stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type); + stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type); +} + void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type) { diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index 7b95adb318..c5b73d980b 100644 --- 
a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -60,6 +60,20 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); +/** + * @brief Apply the full parity staggered-type matdag_mat + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param mass Mass for the dslash operator + * @param daggerBit 0 for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ +void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, + double mass, int daggerBit, QudaDslashType dslash_type); + /** * @brief Apply the even-even or odd-odd preconditioned staggered dslash * diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 52a2f0b124..810c045863 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -53,7 +53,6 @@ struct StaggeredDslashTestWrapper { static inline ColorSpinorField spinor; static inline ColorSpinorField spinorOut; static inline ColorSpinorField spinorRef; - static inline ColorSpinorField tmpCpu; ColorSpinorField cudaSpinor; ColorSpinorField cudaSpinorOut; @@ -92,8 +91,7 @@ struct StaggeredDslashTestWrapper { stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type); break; case dslash_test_type::MatDagMat: - stag_mat(tmpCpu, cpuFat, cpuLong, spinor, mass, dagger, dslash_type); - stag_mat(spinorRef, cpuFat, cpuLong, tmpCpu, mass, 1 - dagger, dslash_type); + stag_matdag_mat(spinorRef, 
cpuFat, cpuLong, spinor, mass, dagger, dslash_type); break; default: errorQuda("Test type %d not defined", static_cast(dtest_type)); } @@ -201,7 +199,6 @@ struct StaggeredDslashTestWrapper { spinor = ColorSpinorField(csParam); spinorOut = ColorSpinorField(csParam); spinorRef = ColorSpinorField(csParam); - tmpCpu = ColorSpinorField(csParam); spinor.Source(QUDA_RANDOM_SOURCE); @@ -307,7 +304,6 @@ struct StaggeredDslashTestWrapper { spinor = {}; spinorOut = {}; spinorRef = {}; - tmpCpu = {}; if (test_split_grid) { vp_spinor.clear(); From 91211105d66ca6cd62cdc03857ad40ee505e9cc0 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 26 Dec 2023 08:01:42 -0800 Subject: [PATCH 43/53] Added a few extra parity checks to staggered dslash host verifies --- tests/host_reference/staggered_dslash_reference.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 4eb46214e7..8fdc135cb2 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -131,6 +131,10 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } + // assert we have single-parity spinors + if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET) + errorQuda("Unexpected site subsets for stag_dslash, out %d in %d", out.SiteSubset(), in.SiteSubset()); + QudaParity otherparity = QUDA_INVALID_PARITY; if (oddBit == QUDA_EVEN_PARITY) { otherparity = QUDA_ODD_PARITY; @@ -217,6 +221,10 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and 
gauge precison are not the same"); } + // assert we have single-parity spinors + if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET) + errorQuda("Unexpected site subsets for stag_matpc, out %d in %d", out.SiteSubset(), in.SiteSubset()); + QudaParity otherparity = QUDA_INVALID_PARITY; if (parity == QUDA_EVEN_PARITY) { otherparity = QUDA_ODD_PARITY; From a1303bd612cf8dfaf5b7a7070cd5871ad4b6b5ce Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 3 Jan 2024 14:06:46 -0800 Subject: [PATCH 44/53] Commented out the asqtad spectrum ctests --- tests/CMakeLists.txt | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5debeed21d..d6de7dcf5c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1402,13 +1402,15 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) - add_test(NAME eigensolve_test_asqtad_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type asqtad --compute-fat-long true - --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256 - --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 - --enable-testing true - --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) + # Skipping this because it's both time consuming and doesn't have any novel spectral + # properties relative to unimproved staggered + #add_test(NAME eigensolve_test_asqtad_${prec} + # COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + # --dslash-type asqtad --compute-fat-long true + # --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256 + # --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 + # --enable-testing true + # --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) if (QUDA_LAPLACE) add_test(NAME eigensolve_test_laplace_${prec} From 3252b6cdf1c1e66a27ce27677e9b776014e15190 Mon Sep 17 00:00:00 2001 From: Evan Weinberg 
Date: Wed, 3 Jan 2024 14:08:37 -0800 Subject: [PATCH 45/53] Removed twisted mass from the CI pipeline, other 4-d Wilson ops are still covered --- ci/docker/Dockerfile.build | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/docker/Dockerfile.build b/ci/docker/Dockerfile.build index 3bd1f20e8e..5322ddfd2d 100644 --- a/ci/docker/Dockerfile.build +++ b/ci/docker/Dockerfile.build @@ -40,7 +40,6 @@ RUN QUDA_TEST_GRID_SIZE="1 1 1 2" cmake -S /quda/src \ -DQUDA_DIRAC_DEFAULT_OFF=ON \ -DQUDA_DIRAC_WILSON=ON \ -DQUDA_DIRAC_CLOVER=ON \ - -DQUDA_DIRAC_TWISTED_MASS=ON \ -DQUDA_DIRAC_TWISTED_CLOVER=ON \ -DQUDA_DIRAC_STAGGERED=ON \ -GNinja \ From c8d301b9bc256e8068e6ebc3829ce82efdecc53c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 3 Jan 2024 14:46:43 -0800 Subject: [PATCH 46/53] Added an explicit link to the Nc = 64, 96 issue in spinor_dilute.in.cu --- lib/spinor_dilute.in.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu index 5472d828b2..4ac97bc9fb 100644 --- a/lib/spinor_dilute.in.cu +++ b/lib/spinor_dilute.in.cu @@ -90,7 +90,7 @@ namespace quda if constexpr (Nc <= 32) { SpinorDilute(src, v, type, local_block); } else { - errorQuda("nColor = %d is too large to compile, see QUDA issues"); + errorQuda("nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)"); } } else { if constexpr (sizeof...(N) > 0) From 6b0ba62df10a339e90c454a3fc893a3bbddc1985 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 3 Jan 2024 20:38:40 -0800 Subject: [PATCH 47/53] Cleaned up C-style casts, plus unnecessary newlines in errorQuda --- .../staggered_dslash_reference.cpp | 33 +++++++++++-------- tests/staggered_invert_test.cpp | 4 +-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 8fdc135cb2..cf053533da 100644 --- 
a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -158,16 +158,23 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF = {long_link.Ghost()[0].data(), long_link.Ghost()[1].data(), long_link.Ghost()[2].data(), long_link.Ghost()[3].data()}; if (in.Precision() == QUDA_DOUBLE_PRECISION) { - // note: qdp_fatlink and qdp_longlink, etc, can be replaced with feature/openmp's raw_pointer - staggeredDslashReference((double *)out.data(), (double **)qdp_fatlink, (double **)qdp_longlink, - (double**)ghost_fatlink, (double**)ghost_longlink, - (double *)in.data(), (double **)fwd_nbr_spinor, - (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type); + staggeredDslashReference(static_cast(out.data()), + reinterpret_cast(qdp_fatlink), + reinterpret_cast(qdp_longlink), + reinterpret_cast(ghost_fatlink), + reinterpret_cast(ghost_longlink), + static_cast(in.data()), + reinterpret_cast(in.fwdGhostFaceBuffer), + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); } else if (in.Precision() == QUDA_SINGLE_PRECISION) { - staggeredDslashReference((float *)out.data(), (float **)qdp_fatlink, (float **)qdp_longlink, - (float**)ghost_fatlink, (float**)ghost_longlink, - (float *)in.data(), (float **)fwd_nbr_spinor, - (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type); + staggeredDslashReference(static_cast(out.data()), + reinterpret_cast(qdp_fatlink), + reinterpret_cast(qdp_longlink), + reinterpret_cast(ghost_fatlink), + reinterpret_cast(ghost_longlink), + static_cast(in.data()), + reinterpret_cast(in.fwdGhostFaceBuffer), + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); } } @@ -190,9 +197,9 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel if (dslash_type == QUDA_LAPLACE_DSLASH) { double kappa = 1.0 / (8 + mass); - xpay((void*)in.data(), kappa, out.data(), out.Length(), out.Precision()); + xpay(in.data(), 
kappa, out.data(), out.Length(), out.Precision()); } else { - axpy(2 * mass, (void*)in.data(), out.data(), out.Length(), out.Precision()); + axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision()); } } @@ -244,8 +251,8 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi double msq_x4 = mass * mass * 4; if (in.Precision() == QUDA_DOUBLE_PRECISION) { - axmy((double *)in.data(), (double)msq_x4, (double *)out.data(), Vh * stag_spinor_site_size); + axmy(static_cast(in.data()), msq_x4, static_cast(out.data()), Vh * stag_spinor_site_size); } else { - axmy((float *)in.data(), (float)msq_x4, (float *)out.data(), Vh * stag_spinor_site_size); + axmy(static_cast(in.data()), static_cast(msq_x4), static_cast(out.data()), Vh * stag_spinor_site_size); } } diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 34a63c212f..7a288535fc 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -459,7 +459,7 @@ int main(int argc, char **argv) } if (inv_deflate && inv_multigrid) - errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n"); + errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve"); initRand(); @@ -477,7 +477,7 @@ int main(int argc, char **argv) // Need to add support for LAPLACE MG? 
if (inv_multigrid) { if (!is_staggered(dslash_type)) { - errorQuda("dslash_type %s not supported for multigrid preconditioner\n", get_dslash_str(dslash_type)); + errorQuda("dslash_type %s not supported for multigrid preconditioner", get_dslash_str(dslash_type)); } } From 4e533feb810b3f3ba27c42729a8059c5c0b286a1 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 3 Jan 2024 20:48:34 -0800 Subject: [PATCH 48/53] Added a cmake flag QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST to toggle running eigensolver dslash tests on the improved staggered operator, which are expensive --- CMakeLists.txt | 8 +++++++- tests/CMakeLists.txt | 19 ++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e66f12f5ce..eb8d85468f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -228,6 +228,8 @@ option(QUDA_CLOVER_DYNAMIC "Dynamically invert the clover term" ON) option(QUDA_CLOVER_RECONSTRUCT "set to ON to enable compressed clover storage (requires QUDA_CLOVER_DYNAMIC)" ON) option(QUDA_CLOVER_CHOLESKY_PROMOTE "Whether to promote the internal precision when inverting the clover term" ON) +option(QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST "Whether to run eigensolver ctests against the improved staggered operator (requires QUDA_DIRAC_STAGGERED)" OFF) + # Set CTest options option(QUDA_CTEST_SEP_DSLASH_POLICIES "Test Dslash policies separately in ctest instead of only autotuning them." 
OFF) option(QUDA_CTEST_DISABLE_BENCHMARKS "Disable benchmark test" ON) @@ -391,7 +393,11 @@ set(CMAKE_EXE_LINKER_FLAGS_SANITIZE CACHE STRING "Flags used by the linker during sanitizer debug builds.") if(QUDA_CLOVER_RECONSTRUCT AND NOT QUDA_CLOVER_DYNAMIC) - message(SEND_ERROR "QUDA_CLOVER_RECONSTRUCT requires QUDA_CLOVER_DYNAMIC)") + message(SEND_ERROR "QUDA_CLOVER_RECONSTRUCT requires QUDA_CLOVER_DYNAMIC") +endif() + +if (QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST AND NOT QUDA_DIRAC_STAGGERED) + message(SEND_ERROR "QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST requires QUDA_DIRAC_STAGGERED") endif() find_package(Threads REQUIRED) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d6de7dcf5c..10088fc05b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1402,15 +1402,16 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) - # Skipping this because it's both time consuming and doesn't have any novel spectral - # properties relative to unimproved staggered - #add_test(NAME eigensolve_test_asqtad_${prec} - # COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - # --dslash-type asqtad --compute-fat-long true - # --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256 - # --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 - # --enable-testing true - # --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) + # These tests are particularly expensive so they are disabled by default + if(QUDA_IMPROVED_STAGGERED_EIGENSOLVER_CTEST) + add_test(NAME eigensolve_test_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad --compute-fat-long true + --eig-n-conv 32 --eig-n-ev 32 --eig-n-kr 256 + --dim 6 6 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 + --enable-testing true + --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) + endif() if (QUDA_LAPLACE) add_test(NAME eigensolve_test_laplace_${prec} 
From 1228db9e5941575175d547322d0d4e193ba49ede Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 3 Jan 2024 20:58:11 -0800 Subject: [PATCH 49/53] is_laplace_enabled -> is_enabled_laplace, other misc cleanup --- tests/host_reference/staggered_dslash_reference.cpp | 3 --- tests/staggered_dslash_ctest.cpp | 2 +- tests/staggered_dslash_test.cpp | 2 +- tests/staggered_eigensolve_test.cpp | 2 +- tests/staggered_eigensolve_test_gtest.hpp | 8 -------- tests/staggered_invert_test.cpp | 2 +- tests/utils/host_utils.h | 6 ++++-- 7 files changed, 8 insertions(+), 17 deletions(-) diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index cf053533da..610f81a0b4 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -147,9 +147,6 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF in.exchangeGhost(otherparity, nFace, daggerBit); - auto fwd_nbr_spinor = in.fwdGhostFaceBuffer; - auto back_nbr_spinor = in.backGhostFaceBuffer; - void *qdp_fatlink[] = {fat_link.data(0), fat_link.data(1), fat_link.data(2), fat_link.data(3)}; void *qdp_longlink[] = {long_link.data(0), long_link.data(1), long_link.data(2), long_link.data(3)}; void *ghost_fatlink[] diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index c035013568..2d5311632a 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -122,7 +122,7 @@ int main(int argc, char **argv) if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } // Only these fermions are supported in this file - if (is_laplace_enabled) { + if constexpr (is_enabled_laplace()) { if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index 
0beb48f887..7905d39db6 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -85,7 +85,7 @@ int main(int argc, char **argv) if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } // Only these fermions are supported in this file - if (is_laplace_enabled) { + if constexpr (is_enabled_laplace()) { if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 85136b3972..e971e0327e 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -273,7 +273,7 @@ int main(int argc, char **argv) initRand(); // Only these fermions are supported in this file - if (is_laplace_enabled) { + if constexpr (is_enabled_laplace()) { if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp index ab879a9b37..6cf272cb5d 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -101,14 +101,6 @@ bool skip_test(test_t test_param) default: return true; break; } } - - // There seems to be some stubborn issue with this combination on 2xGPUs that I can't quite - // comprehend, and am a bit tired to debugging. 
- //if (prec == QUDA_SINGLE_PRECISION && dslash_type == QUDA_ASQTAD_DSLASH && - // eig_type == QUDA_EIG_BLK_TR_LANCZOS && spectrum == QUDA_SPECTRUM_SR_EIG && - // combo_solve_type == QUDA_DIRECT_PC_SOLVE) - // return true; - return false; } diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 7a288535fc..dea21be65e 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -464,7 +464,7 @@ int main(int argc, char **argv) initRand(); // Only these fermions are supported in this file - if (is_laplace_enabled) { + if constexpr (is_enabled_laplace()) { if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index b9449e8651..9431b3ce67 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -41,11 +41,13 @@ extern QudaPrecision &cuda_prec_refinement_sloppy; extern QudaPrecision &cuda_prec_ritz; // Determine if the Laplace operator has been defined +constexpr bool is_enabled_laplace() { #ifdef QUDA_LAPLACE -constexpr bool is_laplace_enabled = true; + return true; #else -constexpr bool is_laplace_enabled = false; + return false; #endif +} // Set some basic parameters via command line or use defaults // Implemented in set_params.cpp From 53dced96630d2269daa8f03eef60ff83c7a8dcac Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 4 Jan 2024 10:53:31 -0800 Subject: [PATCH 50/53] Small stylistic updates to BiCGstab to match conventions in other modern inverters --- include/invert_quda.h | 12 +++++++++++ lib/inv_bicgstab_quda.cpp | 44 ++++++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/include/invert_quda.h b/include/invert_quda.h index 7cf26a6f4f..ef2923f07c 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -1059,6 +1059,13 @@ namespace quda { ColorSpinorField x_sloppy; // Sloppy 
solution accumulator vector bool init = false; + /** + @brief Initiate the fields needed by the solver + @param[in] x Solution vector + @param[in] b Source vector + */ + void create(ColorSpinorField &x, const ColorSpinorField &b); + public: BiCGstab(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, SolverParam ¶m, TimeProfile &profile); @@ -1066,6 +1073,11 @@ namespace quda { void operator()(ColorSpinorField &out, ColorSpinorField &in) override; + /** + @return Return the residual vector from the prior solve + */ + ColorSpinorField &get_residual() override; + virtual bool hermitian() const override { return false; } /** BiCGStab is for any linear system */ virtual QudaInverterType getInverterType() const final { return QUDA_BICGSTAB_INVERTER; } diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index 3fa5afd849..0b0c039baa 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -25,6 +25,33 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_FREE); } + void BiCGstab::create(ColorSpinorField &x, const ColorSpinorField &b) + { + Solver::create(x, b); + + if (!init) { + if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT); + ColorSpinorParam csParam(x); + csParam.create = QUDA_ZERO_FIELD_CREATE; + y = ColorSpinorField(csParam); + r = ColorSpinorField(csParam); + csParam.setPrecision(param.precision_sloppy); + p = ColorSpinorField(csParam); + v = ColorSpinorField(csParam); + t = ColorSpinorField(csParam); + + if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT); + init = true; + } // init + } + + ColorSpinorField &BiCGstab::get_residual() + { + if (!init) errorQuda("No residual vector present"); + if (!param.return_residual) errorQuda("SolverParam::return_residual not enabled"); + return r; + } + int reliable(double &rNorm, double &maxrx, double &maxrr, const double &r2, const double &delta) { // reliable updates rNorm = sqrt(r2); @@ -41,6 +68,8 @@ 
namespace quda { void BiCGstab::operator()(ColorSpinorField &x, ColorSpinorField &b) { + create(x, b); + if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT); double b2 = blas::norm2(b); // norm sq of source @@ -53,7 +82,7 @@ namespace quda { x = b; param.true_res = 0.0; param.true_res_hq = 0.0; - profile.TPSTOP(QUDA_PROFILE_INIT); + if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT); return; } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { b2 = r2; @@ -62,19 +91,6 @@ namespace quda { } } - if (!init) { - ColorSpinorParam csParam(x); - csParam.create = QUDA_ZERO_FIELD_CREATE; - y = ColorSpinorField(csParam); - r = ColorSpinorField(csParam); - csParam.setPrecision(param.precision_sloppy); - p = ColorSpinorField(csParam); - v = ColorSpinorField(csParam); - t = ColorSpinorField(csParam); - - init = true; - } - if (param.deflate) { // Construct the eigensolver and deflation space if requested. if (param.eig_param.eig_type == QUDA_EIG_TR_LANCZOS || param.eig_param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) { From 209d554611d12676d769fbb49b1c4ad5e1decd2f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 5 Jan 2024 11:11:08 -0800 Subject: [PATCH 51/53] Fixed using BiCGstab for generating near-null vectors --- lib/inv_bicgstab_quda.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index 0b0c039baa..1d586ee124 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -75,22 +75,6 @@ namespace quda { double b2 = blas::norm2(b); // norm sq of source double r2 = 0.0; // norm sq of residual - // Check to see that we're not trying to invert on a zero-field source - if (b2 == 0) { - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { - warningQuda("inverting on zero-field source"); - x = b; - param.true_res = 0.0; - param.true_res_hq = 0.0; - if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT); - 
return; - } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { - b2 = r2; - } else { - errorQuda("Null vector computing requires non-zero guess!"); - } - } - if (param.deflate) { // Construct the eigensolver and deflation space if requested. if (param.eig_param.eig_type == QUDA_EIG_TR_LANCZOS || param.eig_param.eig_type == QUDA_EIG_BLK_TR_LANCZOS) { @@ -140,6 +124,22 @@ namespace quda { r2 = blas::xmyNorm(b, r); } + // Check to see that we're not trying to invert on a zero-field source + if (b2 == 0) { + if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + warningQuda("inverting on zero-field source"); + x = b; + param.true_res = 0.0; + param.true_res_hq = 0.0; + if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT); + return; + } else if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { + b2 = r2; + } else { + errorQuda("Null vector computing requires non-zero guess!"); + } + } + // set field aliasing according to whether we are doing mixed precision or not if (param.precision_sloppy == x.Precision()) { r_sloppy = r.create_alias(); From ca6b814bf71f0ff9a8306cb83604a5cd86c1ee71 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 9 Jan 2024 13:45:57 -0800 Subject: [PATCH 52/53] clang-format --- include/invert_quda.h | 12 +- lib/eig_block_trlm.cpp | 2 +- lib/eig_trlm.cpp | 2 +- lib/interface_quda.cpp | 10 +- lib/inv_bicgstab_quda.cpp | 42 +++--- lib/spinor_dilute.in.cu | 3 +- tests/hisq_stencil_ctest.cpp | 44 +++--- tests/hisq_stencil_test.cpp | 26 ++-- tests/hisq_stencil_test_utils.h | 80 ++++++----- tests/host_reference/dslash_reference.cpp | 78 ++++++----- tests/host_reference/dslash_reference.h | 98 +++++++------- .../staggered_dslash_reference.cpp | 53 ++++---- .../staggered_dslash_reference.h | 128 +++++++++--------- tests/invert_test_gtest.hpp | 4 +- tests/staggered_dslash_ctest.cpp | 6 +- tests/staggered_dslash_test.cpp | 10 +- tests/staggered_dslash_test_utils.h | 13 +- tests/staggered_eigensolve_test.cpp | 45 +++--- 
tests/staggered_eigensolve_test_gtest.hpp | 22 ++- tests/staggered_invert_test.cpp | 48 ++++--- tests/staggered_invert_test_gtest.hpp | 53 ++++---- tests/utils/command_line_params.cpp | 23 ++-- tests/utils/host_utils.cpp | 2 +- tests/utils/host_utils.h | 3 +- tests/utils/staggered_gauge_utils.cpp | 3 +- 25 files changed, 418 insertions(+), 392 deletions(-) diff --git a/include/invert_quda.h b/include/invert_quda.h index ef2923f07c..7ab7a1138c 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -1049,12 +1049,12 @@ namespace quda { private: const DiracMdagM matMdagM; // used by the eigensolver - ColorSpinorField y; // Full precision solution accumulator - ColorSpinorField r; // Full precision residual vector - ColorSpinorField p; // Sloppy precision search direction - ColorSpinorField v; // Sloppy precision A * p - ColorSpinorField t; // Sloppy precision vector used for minres step - ColorSpinorField r0; // Bi-orthogonalization vector + ColorSpinorField y; // Full precision solution accumulator + ColorSpinorField r; // Full precision residual vector + ColorSpinorField p; // Sloppy precision search direction + ColorSpinorField v; // Sloppy precision A * p + ColorSpinorField t; // Sloppy precision vector used for minres step + ColorSpinorField r0; // Bi-orthogonalization vector ColorSpinorField r_sloppy; // Slopy precision residual vector ColorSpinorField x_sloppy; // Sloppy solution accumulator vector bool init = false; diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 8a8f90063a..890257c1ed 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -111,7 +111,7 @@ namespace quda // Lambda that returns mat_norm for LR and returns the relevant alpha // (the corresponding Ritz value) for SR - auto check_norm = [&] (double sr_norm) -> double { + auto check_norm = [&](double sr_norm) -> double { if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) return mat_norm; else diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 
2d6f1d2cac..d994fbc272 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -92,7 +92,7 @@ namespace quda // Lambda that returns mat_norm for LR and returns the relevant alpha // (the corresponding Ritz value) for SR - auto check_norm = [&] (double sr_norm) -> double { + auto check_norm = [&](double sr_norm) -> double { if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) return mat_norm; else diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index a47f15a794..46fed2c43d 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3014,7 +3014,7 @@ void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_para template void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // color spinor field pointers, and inv_param void *h_gauge, void *milc_fatlinks, void *milc_longlinks, - QudaGaugeParam *gauge_param_, // gauge field pointers + QudaGaugeParam *gauge_param_, // gauge field pointers void *h_clover, void *h_clovinv, // clover field pointers Interface op, Args... args) { @@ -3036,8 +3036,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // Create a local copy of gauge_param that we can modify without perturbing // the original one - if (!gauge_param_) - errorQuda("Input gauge_param is null"); + if (!gauge_param_) errorQuda("Input gauge_param is null"); QudaGaugeParam gauge_param = *gauge_param_; if (num_sub_partition == 1) { // In this case we don't split the grid. 
@@ -3069,8 +3068,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } else if (milc_fatlinks) { is_staggered = true; if (param->dslash_type == QUDA_ASQTAD_DSLASH) { - if (!milc_longlinks) - errorQuda("milc_longlinks is null for an asqtad dslash"); + if (!milc_longlinks) errorQuda("milc_longlinks is null for an asqtad dslash"); is_asqtad = true; } } else { @@ -3239,7 +3237,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col loadGaugeQuda(collected_gauge.raw_pointer(), &gauge_param); } else { loadFatLongGaugeQuda(param, &gauge_param, collected_milc_fatlink_field.raw_pointer(), - (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr); + (is_asqtad) ? collected_milc_longlink_field.raw_pointer() : nullptr); } logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n"); diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index 1d586ee124..742e026bd2 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -216,14 +216,13 @@ namespace quda { bool converged = convergence(r2, heavy_quark_res, stop, param.tol_hq); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) - printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n", - blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p), - blas::norm2(r0), blas::norm2(t)); + printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n", blas::norm2(x), blas::norm2(r_sloppy), + blas::norm2(v), blas::norm2(p), blas::norm2(r0), blas::norm2(t)); // track if we just performed an exact recalculation of y, r, r2 bool just_updated = false; - while ( !converged && k < param.maxiter) { + while (!converged && k < param.maxiter) { just_updated = false; matSloppy(v, p); @@ -253,7 +252,7 @@ namespace quda { double s2 = blas::norm2(r_sloppy); Complex r0t = blas::cDotProduct(r0, t); beta = -r0t / r0v; - r2 = s2 - real(omega * conj(tr)) ; + r2 = s2 - real(omega * conj(tr)); // now we can work out if 
we need to do a reliable update updateR = reliable(rNorm, maxrx, maxrr, r2, delta); } else { @@ -263,24 +262,24 @@ namespace quda { } if (param.pipeline && !updateR) { - //x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p + // x += alpha*p + omega*r, r -= omega*t, p = r - beta*omega*v + beta*p blas::caxpbypzYmbw(alpha, p, omega, r_sloppy, x_sloppy, t); - blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p); - //tripleBiCGstabUpdate(alpha, p, omega, r_sloppy, x_sloppy, t, -beta*omega, v, beta, p + blas::cxpaypbz(r_sloppy, -beta * omega, v, beta, p); + // tripleBiCGstabUpdate(alpha, p, omega, r_sloppy, x_sloppy, t, -beta*omega, v, beta, p } else { - //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) + // x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, r_sloppy, x_sloppy, t, r0); rho0 = rho; rho = Complex(rho_r2.x, rho_r2.y); r2 = rho_r2.z; } - if (use_heavy_quark_res && k % heavy_quark_check==0) { + if (use_heavy_quark_res && k % heavy_quark_check == 0) { if (&x != &x_sloppy) { - heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x_sloppy, r_sloppy).z); + heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x_sloppy, r_sloppy).z); } else { - blas::copy(r, r_sloppy); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z); + blas::copy(r, r_sloppy); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z); } } @@ -309,7 +308,7 @@ namespace quda { rNorm = sqrt(r2); maxrr = rNorm; maxrx = rNorm; - //r0Norm = rNorm; + // r0Norm = rNorm; rUpdate++; just_updated = true; @@ -319,9 +318,8 @@ namespace quda { PrintStats("BiCGstab", k, r2, b2, heavy_quark_res); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) - printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, r0=%e, t2=%e\n", - blas::norm2(x), blas::norm2(r_sloppy), blas::norm2(v), blas::norm2(p), - blas::norm2(r0), blas::norm2(t)); + printfQuda("BiCGstab debug: x2=%e, r2=%e, v2=%e, p2=%e, 
r0=%e, t2=%e\n", blas::norm2(x), blas::norm2(r_sloppy), + blas::norm2(v), blas::norm2(p), blas::norm2(r0), blas::norm2(t)); converged = convergence(r2, heavy_quark_res, stop, param.tol_hq); @@ -347,7 +345,7 @@ namespace quda { rNorm = sqrt(r2); maxrr = rNorm; maxrx = rNorm; - //r0Norm = rNorm; + // r0Norm = rNorm; rUpdate++; just_updated = true; @@ -362,9 +360,11 @@ namespace quda { // update p if ((!param.pipeline || updateR) && !converged) { // need to update if not pipeline or did a reliable update - if (abs(rho*alpha) == 0.0) beta = 0.0; - else beta = (rho/rho0) * (alpha/omega); - blas::cxpaypbz(r_sloppy, -beta*omega, v, beta, p); + if (abs(rho * alpha) == 0.0) + beta = 0.0; + else + beta = (rho / rho0) * (alpha / omega); + blas::cxpaypbz(r_sloppy, -beta * omega, v, beta, p); } } diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu index 4ac97bc9fb..eadd519c5b 100644 --- a/lib/spinor_dilute.in.cu +++ b/lib/spinor_dilute.in.cu @@ -90,7 +90,8 @@ namespace quda if constexpr (Nc <= 32) { SpinorDilute(src, v, type, local_block); } else { - errorQuda("nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)"); + errorQuda( + "nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)"); } } else { if constexpr (sizeof...(N) > 0) diff --git a/tests/hisq_stencil_ctest.cpp b/tests/hisq_stencil_ctest.cpp index 55186c6a5f..6df6d1d977 100644 --- a/tests/hisq_stencil_ctest.cpp +++ b/tests/hisq_stencil_ctest.cpp @@ -22,30 +22,31 @@ class HisqStencilTest : public ::testing::TestWithParam<::testing::tuple(::testing::get<0>(GetParam())); QudaReconstructType recon = static_cast(::testing::get<1>(GetParam())); - if ((QUDA_PRECISION & precision) == 0 - || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) - return true; + if ((QUDA_PRECISION & precision) == 0 || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) return true; - const std::array partition_enabled {true, 
true, true, false, true, false, false, false, - true, false, false, false, true, false, true, true}; + const std::array partition_enabled {true, true, true, false, true, false, false, false, + true, false, false, false, true, false, true, true}; if (!ctest_all_partitions && !partition_enabled[::testing::get<3>(GetParam())]) return true; return false; } - void display_test_info(QudaPrecision prec, QudaReconstructType link_recon, bool has_naik) { + void display_test_info(QudaPrecision prec, QudaReconstructType link_recon, bool has_naik) + { printfQuda("running the following test:\n"); - printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n"); + printfQuda( + "link_precision link_reconstruct space_dimension T_dimension Ordering\n"); printfQuda("%s %s %d/%d/%d/ %d %s \n", - get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order)); + get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order)); printfQuda("Grid partition info: X Y Z T\n"); printfQuda(" %d %d %d %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2), - dimPartitioned(3)); + dimPartitioned(3)); printfQuda("Number of Naiks: %d\n", has_naik ? 
2 : 1); } public: - virtual void SetUp() { + virtual void SetUp() + { QudaPrecision prec = static_cast(::testing::get<0>(GetParam())); QudaReconstructType recon = static_cast(::testing::get<1>(GetParam())); bool has_naik = ::testing::get<2>(GetParam()); @@ -62,14 +63,13 @@ class HisqStencilTest : public ::testing::TestWithParam<::testing::tuple> param) +std::string +gethisqstenciltestname(testing::TestParamInfo<::testing::tuple> param) { const QudaPrecision prec = static_cast(::testing::get<0>(param.param)); const QudaReconstructType recon = static_cast(::testing::get<1>(param.param)); @@ -165,15 +163,11 @@ std::string gethisqstenciltestname(testing::TestParamInfo<::testing::tuple, 3> act_paths; // initial links in MILC order - static inline void* milc_sitelink = nullptr; + static inline void *milc_sitelink = nullptr; // storage for CPU reference fat and long links w/zero Naik static inline void *fat_reflink[4] = {nullptr, nullptr, nullptr, nullptr}; @@ -69,7 +69,8 @@ struct HisqStencilTestWrapper { static inline void *qdp_fatlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; static inline void *qdp_longlink_eps[4] = {nullptr, nullptr, nullptr, nullptr}; - void set_naik(bool has_naik) { + void set_naik(bool has_naik) + { if (has_naik) { eps_naik = -0.03; // semi-arbitrary n_naiks = 2; @@ -79,7 +80,8 @@ struct HisqStencilTestWrapper { } } - void init_ctest(QudaPrecision prec_, QudaReconstructType link_recon_, bool has_naik) { + void init_ctest(QudaPrecision prec_, QudaReconstructType link_recon_, bool has_naik) + { prec = prec_; link_recon = link_recon_; @@ -101,7 +103,8 @@ struct HisqStencilTestWrapper { init(); } - void init_test() { + void init_test() + { gauge_param = newQudaGaugeParam(); setStaggeredGaugeParam(gauge_param); @@ -113,7 +116,8 @@ struct HisqStencilTestWrapper { init(); } - void init_host() { + void init_host() + { setDims(gauge_param.X); dw_setDims(gauge_param.X, 1); @@ -142,12 +146,12 @@ struct HisqStencilTestWrapper { // Second path: create X, 
long links act_paths[1] = { ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */ - /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ - (-1.0 / 24.0), /* Naik */ - (-1.0 / 8.0) * 0.5, /* simple staple */ - (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ - (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ - (-2.0 / 16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */ + /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ + (-1.0 / 24.0), /* Naik */ + (-1.0 / 8.0) * 0.5, /* simple staple */ + (1.0 / 8.0) * 0.25 * 0.5, /* displace link in two directions */ + (-1.0 / 8.0) * 0.125 * (1.0 / 6.0), /* displace link in three directions */ + (-2.0 / 16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */ }; // Paths for epsilon corrections. Not used if n_naiks = 1. @@ -165,7 +169,7 @@ struct HisqStencilTestWrapper { //////////////////////////////////// setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, - svd_abs_error); + svd_abs_error); ///////////////// // Input links // @@ -214,7 +218,8 @@ struct HisqStencilTestWrapper { #endif } - void init() { + void init() + { // reset the reconstruct in gauge param gauge_param.reconstruct = link_recon; @@ -245,7 +250,8 @@ struct HisqStencilTestWrapper { } } - static void end() { + static void end() + { if (milc_sitelink) host_free(milc_sitelink); // Clean up GPU compute links @@ -262,7 +268,8 @@ struct HisqStencilTestWrapper { freeGaugeQuda(); } - static void destroy() { + static void destroy() + { for (int i = 0; i < 4; i++) { host_free(fat_reflink[i]); @@ -292,7 +299,8 @@ struct HisqStencilTestWrapper { // X -- after 2nd level of smearing, non-SU(3) /*--------------------------------------------------------------------*/ - double llfatCUDA(int niter) { + double llfatCUDA(int niter) + { host_timer_t host_timer; comm_barrier(); @@ -337,7 +345,8 @@ struct HisqStencilTestWrapper { return 
host_timer.last(); } - void run_test(int niter, bool print_metrics = false) { + void run_test(int niter, bool print_metrics = false) + { ////////////////////// // Perform GPU test // ////////////////////// @@ -357,26 +366,26 @@ struct HisqStencilTestWrapper { if (print_metrics) { // FIXME: does not include unitarization, extra naiks int volume = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3]; - //long long flops = 61632 * (long long)niter; // Constructing V field + // long long flops = 61632 * (long long)niter; // Constructing V field // Constructing W field? // Constructing separate Naiks - //flops += 61632 * (long long)niter; // Constructing X field - //flops += (252 * 4) * (long long)niter; // long-link contribution + // flops += 61632 * (long long)niter; // Constructing X field + // flops += (252 * 4) * (long long)niter; // long-link contribution printfQuda("%fus per HISQ link build\n", 1e6 * secs / niter); printfQuda("%llu flops per HISQ link build, %llu flops per site %llu bytes per site\n", flops / niter, - (flops / niter) / volume, (bytes / niter) / volume); + (flops / niter) / volume, (bytes / niter) / volume); double gflops = 1.0e-9 * flops / secs; - printfQuda("GFLOPS = %f\n", gflops); + printfQuda("GFLOPS = %f\n", gflops); double gbytes = 1.0e-9 * bytes / secs; - printfQuda("GBYTES = %f\n", gbytes); + printfQuda("GBYTES = %f\n", gbytes); // Old metric - //double perf = flops / (secs * 1024 * 1024 * 1024); - //printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf); + // double perf = flops / (secs * 1024 * 1024 * 1024); + // printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs * 1000) / niter, perf); } } @@ -407,8 +416,8 @@ struct HisqStencilTestWrapper { if (n_naiks > 1) { for (int dir = 0; dir < 4; dir++) { res[0] = std::max(res[0], - compare_floats_v2(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, max_dev, - gauge_param.cpu_prec)); + 
compare_floats_v2(fat_reflink_eps[dir], qdp_fatlink_eps[dir], V * gauge_site_size, max_dev, + gauge_param.cpu_prec)); } strong_check_link(qdp_fatlink_eps, "Fat link GPU results: ", fat_reflink_eps, "CPU reference results:", V, @@ -416,32 +425,35 @@ struct HisqStencilTestWrapper { for (int dir = 0; dir < 4; ++dir) { res[1] = std::max(res[1], - compare_floats_v2(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, max_dev, - gauge_param.cpu_prec)); + compare_floats_v2(long_reflink_eps[dir], qdp_longlink_eps[dir], V * gauge_site_size, max_dev, + gauge_param.cpu_prec)); } strong_check_link(qdp_longlink_eps, "Long link GPU results: ", long_reflink_eps, "CPU reference results:", V, gauge_param.cpu_prec); } else { for (int dir = 0; dir < 4; dir++) { - res[0] = std::max(res[0], + res[0] = std::max( + res[0], compare_floats_v2(fat_reflink[dir], qdp_fatlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec)); } - strong_check_link(qdp_fatlink, "Fat link GPU results: ", fat_reflink, "CPU reference results:", V, gauge_param.cpu_prec); + strong_check_link(qdp_fatlink, "Fat link GPU results: ", fat_reflink, "CPU reference results:", V, + gauge_param.cpu_prec); for (int dir = 0; dir < 4; ++dir) { - res[1] = std::max(res[1], + res[1] = std::max( + res[1], compare_floats_v2(long_reflink[dir], qdp_longlink[dir], V * gauge_site_size, max_dev, gauge_param.cpu_prec)); } - strong_check_link(qdp_longlink, "Long link GPU results: ", long_reflink, "CPU reference results:", V, gauge_param.cpu_prec); + strong_check_link(qdp_longlink, "Long link GPU results: ", long_reflink, "CPU reference results:", V, + gauge_param.cpu_prec); } printfQuda("Fat link test %s\n", (res[0] < max_dev) ? "PASSED" : "FAILED"); printfQuda("Long link test %s\n", (res[1] < max_dev) ? 
"PASSED" : "FAILED"); return res; - } }; diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 534d0b2a1b..bb7efa83a4 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -743,15 +743,19 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou return l2r; } -std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, - quda::GaugeField &long_link, QudaInvertParam &inv_param) { +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, + quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param) +{ std::vector out_vector(1); out_vector[0] = out; return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param); } -std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, - quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param) +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, + std::vector &out_vector, + quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 
1 : 0; double l2r_max = 0.0; @@ -762,8 +766,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: quda::ColorSpinorField ref(csParam); if (multishift > 1) { - if (dslash_type == QUDA_LAPLACE_DSLASH) - errorQuda("Multishift solves do not support the laplace operator (yet)"); + if (dslash_type == QUDA_LAPLACE_DSLASH) errorQuda("Multishift solves do not support the laplace operator (yet)"); if (inv_param.solution_type != QUDA_MATPC_SOLUTION) errorQuda("Invalid staggered multishift solution type %d, expected QUDA_MATPC_SOLUTION", inv_param.solution_type); @@ -771,13 +774,13 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: // Check the mat_pc type and make sure it's sane QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { - case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; - case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; - default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; + case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; + case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; + default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } for (int i = 0; i < multishift; i++) { - auto& out = out_vector[i]; + auto &out = out_vector[i]; double mass = 0.5 * sqrt(inv_param.offset[i]); stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); @@ -789,9 +792,9 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: printfQuda("%dth solution: mass=%f, ", i, mass); printfQuda("Shift %2d residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, " - "QUDA = %9.6e, host = %9.6e\n", - i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, - inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr); + "QUDA = %9.6e, host = %9.6e\n", + i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i], + 
inv_param.true_res_hq_offset[i], hqr); // Empirical: if the cpu residue is more than 1 order the target accuracy, then it fails to converge if (sqrt(nrm2 / src2) > 10 * inv_param.tol_offset[i]) { printfQuda("Shift %2d has empirically failed to converge\n", i); @@ -802,20 +805,19 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: } } else { - auto& out = out_vector[0]; + auto &out = out_vector[0]; double mass = inv_param.mass; if (inv_param.solution_type == QUDA_MAT_SOLUTION) { stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); // correct for the massRescale function inside invertQuda - if (is_laplace(dslash_type)) - ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); + if (is_laplace(dslash_type)) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { - case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; - case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; - default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; + case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; + case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; + default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { @@ -831,8 +833,8 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: double l2r = sqrt(nrm2 / src2); printfQuda("Residuals: (L2 relative) tol %9.6e, QUDA = %9.6e, host = %9.6e; (heavy-quark) tol %9.6e, QUDA = %9.6e, " - "host = %9.6e\n", - inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); + "host = %9.6e\n", + inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); l2r_max = l2r; 
hqr_max = hqr; @@ -841,10 +843,10 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std:: return {l2r_max, hqr_max}; } -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) { - QudaInvertParam& inv_param = *(eig_param.invert_param); + QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); bool normop = (eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? true : false); @@ -853,11 +855,15 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co // Reverse engineer a "solution_type" to help determine which host dslash needs to be applied QudaSolutionType sol_type = QUDA_INVALID_SOLUTION; if (normop) { - if (use_pc) errorQuda("The normal preconditioned staggered op is not supported"); - else sol_type = QUDA_MATDAG_MAT_SOLUTION; + if (use_pc) + errorQuda("The normal preconditioned staggered op is not supported"); + else + sol_type = QUDA_MATDAG_MAT_SOLUTION; } else { - if (use_pc) sol_type = QUDA_MATPC_SOLUTION; - else sol_type = QUDA_MAT_SOLUTION; + if (use_pc) + sol_type = QUDA_MATPC_SOLUTION; + else + sol_type = QUDA_MAT_SOLUTION; } // Create temporary spinors @@ -869,9 +875,9 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co } else if (sol_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { - case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; - case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; - default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; + 
case QUDA_MATPC_EVEN_EVEN: parity = QUDA_EVEN_PARITY; break; + case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; + default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type); } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { @@ -889,16 +895,16 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Co return l2r; } -double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, + double _Complex sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link) { - QudaInvertParam& inv_param = *(eig_param.invert_param); + QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); double mass = inv_param.mass; - if (use_pc) - errorQuda("The SVD of the preconditioned staggered op is not supported"); + if (use_pc) errorQuda("The SVD of the preconditioned staggered op is not supported"); // Create temporary spinors quda::ColorSpinorParam csParam(spinor_left); diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 0388b2a10d..c464836f71 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -110,60 +110,64 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv); /** - * @brief Verify a staggered inversion on the host. 
This version is a thin wrapper around a version that takes - * an array of outputs as is necessary for handling both single- and multi-shift solves. - * - * @param in The initial rhs - * @param out The solution to A out = in - * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied - * @param long_link The long links; null for naive staggered and Laplace - * @param inv_param Invert params, used to query the solve type, etc - * @return The residual and HQ residual (if requested) - */ -std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, - quda::GaugeField &long_link, QudaInvertParam &inv_param); + * @brief Verify a staggered inversion on the host. This version is a thin wrapper around a version that takes + * an array of outputs as is necessary for handling both single- and multi-shift solves. + * + * @param in The initial rhs + * @param out The solution to A out = in + * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @param inv_param Invert params, used to query the solve type, etc + * @return The residual and HQ residual (if requested) + */ +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, + quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param); /** - * @brief Verify a single- or multi-shift staggered inversion on the host - * - * @param in The initial rhs - * @param out The solutions to (A + shift) out = in for multiple shifts; shift == 0 for a single shift solve - * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied - * @param long_link The long links; null for naive staggered and Laplace - * @param inv_param Invert params, used to query the solve 
type, etc, also includes the shifts - * @return The residual and HQ residual (if requested) - */ -std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, - quda::GaugeField &fat_link, quda::GaugeField &long_link, QudaInvertParam &inv_param); + * @brief Verify a single- or multi-shift staggered inversion on the host + * + * @param in The initial rhs + * @param out The solutions to (A + shift) out = in for multiple shifts; shift == 0 for a single shift solve + * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts + * @return The residual and HQ residual (if requested) + */ +std::array verifyStaggeredInversion(quda::ColorSpinorField &in, + std::vector &out_vector, + quda::GaugeField &fat_link, quda::GaugeField &long_link, + QudaInvertParam &inv_param); /** - * @brief Verify a staggered-type eigenvector - * - * @param spinor The host eigenvector to be verified - * @param lambda The host eigenvalue to be verified - * @param i The number of the eigenvalue, only used when printing outputs - * @param eig_param Eigensolve params, used to query the operator type, etc - * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied - * @param long_link The long links; null for naive staggered and Laplace - * @return The residual norm - */ -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField& spinor, double _Complex lambda, int i, + * @brief Verify a staggered-type eigenvector + * + * @param spinor The host eigenvector to be verified + * @param lambda The host eigenvalue to be verified + * @param i The number of the eigenvalue, only used when printing outputs + * @param eig_param Eigensolve params, used to query the operator type, etc + * @param 
fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @return The residual norm + */ +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); /** - * @brief Verify a staggered-type singular vector - * - * @param spinor The host left singular vector to be verified - * @param spinor_right The host right singular vector to be verified - * @param lambda The host singular value to be verified - * @param i The number of the singular value, only used when printing outputs - * @param eig_param Eigensolve params, used to query the operator type, etc - * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied - * @param long_link The long links; null for naive staggered and Laplace - * @return The residual norm - */ -double verifyStaggeredTypeSingularVector(quda::ColorSpinorField& spinor_left, quda::ColorSpinorField &spinor_right, double _Complex sigma, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); + * @brief Verify a staggered-type singular vector + * + * @param spinor The host left singular vector to be verified + * @param spinor_right The host right singular vector to be verified + * @param lambda The host singular value to be verified + * @param i The number of the singular value, only used when printing outputs + * @param eig_param Eigensolve params, used to query the operator type, etc + * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied + * @param long_link The long links; null for naive staggered and Laplace + * @return The residual norm + */ +double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, 
quda::ColorSpinorField &spinor_right, + double _Complex sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link); // i represents a "half index" into an even or odd "half lattice". // when oddBit={0,1} the half lattice is {even,odd}. diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 610f81a0b4..bf6bcf8b92 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -129,7 +129,9 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } + if (in.Precision() != fat_link.Precision()) { + errorQuda("The spinor precision and gauge precision are not the same"); + } // assert we have single-parity spinors if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET) @@ -151,27 +153,21 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF void *qdp_longlink[] = {long_link.data(0), long_link.data(1), long_link.data(2), long_link.data(3)}; void *ghost_fatlink[] = {fat_link.Ghost()[0].data(), fat_link.Ghost()[1].data(), fat_link.Ghost()[2].data(), fat_link.Ghost()[3].data()}; - void *ghost_longlink[] - = {long_link.Ghost()[0].data(), long_link.Ghost()[1].data(), long_link.Ghost()[2].data(), long_link.Ghost()[3].data()}; + void *ghost_longlink[] = {long_link.Ghost()[0].data(), long_link.Ghost()[1].data(), long_link.Ghost()[2].data(), + long_link.Ghost()[3].data()}; if (in.Precision() == QUDA_DOUBLE_PRECISION) { - staggeredDslashReference(static_cast(out.data()), - reinterpret_cast(qdp_fatlink), - reinterpret_cast(qdp_longlink), - 
reinterpret_cast(ghost_fatlink), - reinterpret_cast(ghost_longlink), - static_cast(in.data()), - reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + staggeredDslashReference(static_cast(out.data()), reinterpret_cast(qdp_fatlink), + reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), + reinterpret_cast(ghost_longlink), static_cast(in.data()), + reinterpret_cast(in.fwdGhostFaceBuffer), + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); } else if (in.Precision() == QUDA_SINGLE_PRECISION) { - staggeredDslashReference(static_cast(out.data()), - reinterpret_cast(qdp_fatlink), - reinterpret_cast(qdp_longlink), - reinterpret_cast(ghost_fatlink), - reinterpret_cast(ghost_longlink), - static_cast(in.data()), - reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + staggeredDslashReference(static_cast(out.data()), reinterpret_cast(qdp_fatlink), + reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), + reinterpret_cast(ghost_longlink), static_cast(in.data()), + reinterpret_cast(in.fwdGhostFaceBuffer), + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); } } @@ -179,7 +175,9 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } + if (in.Precision() != fat_link.Precision()) { + errorQuda("The spinor precision and gauge precision are not the same"); + } // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -201,10 +199,12 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel } void 
stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } + if (in.Precision() != fat_link.Precision()) { + errorQuda("The spinor precision and gauge precision are not the same"); + } // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -219,8 +219,8 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type); } -void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, double mass, int, - QudaParity parity, QudaDslashType dslash_type) +void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } @@ -248,8 +248,9 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi double msq_x4 = mass * mass * 4; if (in.Precision() == QUDA_DOUBLE_PRECISION) { - axmy(static_cast(in.data()), msq_x4, static_cast(out.data()), Vh * stag_spinor_site_size); + axmy(static_cast(in.data()), msq_x4, static_cast(out.data()), Vh * stag_spinor_site_size); } else { - axmy(static_cast(in.data()), static_cast(msq_x4), static_cast(out.data()), Vh * stag_spinor_site_size); + axmy(static_cast(in.data()), static_cast(msq_x4), static_cast(out.data()), + Vh 
* stag_spinor_site_size); } } diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index c5b73d980b..b39287bfb1 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -12,79 +12,79 @@ using namespace quda; void setDims(int *); /** - * @brief Base host routine to apply the even-odd or odd-even component of a staggered-type dslash - * - * @tparam real_t Datatype used in the host dslash - * @param res Host output result - * @param fatlink Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash - * @param longlink Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash - * @param ghostFatlink Ghost zones for the host fat links - * @param ghostLonglink Ghost zones for the host long links - * @param spinorField Host input spinor - * @param fwd_nbr_spinor Forward ghost zones for the host input spinor - * @param back_nbr_spinor Backwards ghost zones for the host input spinor - * @param oddBit 0 for D_eo, 1 for D_oe - * @param daggerBit 0 for the regular operator, 1 for the dagger operator - * @param dslash_type Dslash type - */ + * @brief Base host routine to apply the even-odd or odd-even component of a staggered-type dslash + * + * @tparam real_t Datatype used in the host dslash + * @param res Host output result + * @param fatlink Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param longlink Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param ghostFatlink Ghost zones for the host fat links + * @param ghostLonglink Ghost zones for the host long links + * @param spinorField Host input spinor + * @param fwd_nbr_spinor Forward ghost zones for the host input spinor + * @param back_nbr_spinor Backwards ghost zones for the host input spinor + * @param oddBit 0 for D_eo, 1 for D_oe + * @param daggerBit 0 
for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ template void staggeredDslashReference(real_t *res, real_t **fatlink, real_t **longlink, real_t **ghostFatlink, real_t **ghostLonglink, real_t *spinorField, real_t **fwd_nbr_spinor, real_t **back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type); /** - * @brief Apply even-odd or odd-even component of a staggered-type dslash - * - * @param out Host output rhs - * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash - * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash - * @param in Host input spinor - * @param oddBit 0 for D_eo, 1 for D_oe - * @param daggerBit 0 for the regular operator, 1 for the dagger operator - * @param dslash_type Dslash type - */ -void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, - int oddBit, int daggerBit, QudaDslashType dslash_type); + * @brief Apply even-odd or odd-even component of a staggered-type dslash + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param oddBit 0 for D_eo, 1 for D_oe + * @param daggerBit 0 for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ +void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type); /** - * @brief Apply the full parity staggered-type dslash - * - * @param out Host output rhs - * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash - * @param long_link Long links for an asqtad 
dslash, or an empty GaugeField for staggered or Laplace dslash - * @param in Host input spinor - * @param mass Mass for the dslash operator - * @param daggerBit 0 for the regular operator, 1 for the dagger operator - * @param dslash_type Dslash type - */ -void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, - double mass, int daggerBit, QudaDslashType dslash_type); + * @brief Apply the full parity staggered-type dslash + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param mass Mass for the dslash operator + * @param daggerBit 0 for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ +void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); /** - * @brief Apply the full parity staggered-type matdag_mat - * - * @param out Host output rhs - * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash - * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash - * @param in Host input spinor - * @param mass Mass for the dslash operator - * @param daggerBit 0 for the regular operator, 1 for the dagger operator - * @param dslash_type Dslash type - */ -void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, - double mass, int daggerBit, QudaDslashType dslash_type); + * @brief Apply the full parity staggered-type matdag_mat + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace 
dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param mass Mass for the dslash operator + * @param daggerBit 0 for the regular operator, 1 for the dagger operator + * @param dslash_type Dslash type + */ +void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); /** - * @brief Apply the even-even or odd-odd preconditioned staggered dslash - * - * @param out Host output rhs - * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash - * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash - * @param in Host input spinor - * @param mass Mass for the dslash operator - * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator - * @param parity Parity of preconditioned dslash - * @param dslash_type Dslash type - */ -void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, const ColorSpinorField &in, - double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type); + * @brief Apply the even-even or odd-odd preconditioned staggered dslash + * + * @param out Host output rhs + * @param fat_link Fat links for an asqtad dslash, or the gauge links for a staggered or Laplace dslash + * @param long_link Long links for an asqtad dslash, or an empty GaugeField for staggered or Laplace dslash + * @param in Host input spinor + * @param mass Mass for the dslash operator + * @param dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator + * @param parity Parity of preconditioned dslash + * @param dslash_type Dslash type + */ +void stag_matpc(ColorSpinorField &out, const GaugeField 
&fat_link, const GaugeField &long_link, + const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type); diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp index dca4bc5e9a..55c1c3f788 100644 --- a/tests/invert_test_gtest.hpp +++ b/tests/invert_test_gtest.hpp @@ -69,9 +69,7 @@ TEST_P(InvertTest, verify) if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq; auto tol = inv_param.tol; - if (is_chiral(inv_param.dslash_type)) { - tol *= std::sqrt(static_cast(inv_param.Ls)); - } + if (is_chiral(inv_param.dslash_type)) { tol *= std::sqrt(static_cast(inv_param.Ls)); } // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50; // Slight loss of precision possible when reconstructing full solution diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index 2d5311632a..65edd69124 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -126,10 +126,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) - errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (!is_staggered(dslash_type)) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index 7905d39db6..82c84c3225 100644 --- a/tests/staggered_dslash_test.cpp +++ 
b/tests/staggered_dslash_test.cpp @@ -52,8 +52,8 @@ TEST_F(StaggeredDslashTest, verify) double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec); // give it a tiny bump for fixed precision, recon 8 - if (dslash_test_wrapper.inv_param.cuda_prec <= QUDA_HALF_PRECISION && - dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9) + if (dslash_test_wrapper.inv_param.cuda_prec <= QUDA_HALF_PRECISION + && dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9) tol *= 1.1; ASSERT_LE(deviation, tol) << "reference and QUDA implementations do not agree"; @@ -89,10 +89,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) - errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (!is_staggered(dslash_type)) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 810c045863..0a3d589ca1 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -81,15 +81,9 @@ struct StaggeredDslashTestWrapper { // compare to dslash reference implementation printfQuda("Calculating reference implementation..."); switch (dtest_type) { - case dslash_test_type::Dslash: - stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type); - break; - case dslash_test_type::MatPC: - stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, parity, dslash_type); - break; - case dslash_test_type::Mat: - stag_mat(spinorRef, 
cpuFat, cpuLong, spinor, mass, dagger, dslash_type); - break; + case dslash_test_type::Dslash: stag_dslash(spinorRef, cpuFat, cpuLong, spinor, parity, dagger, dslash_type); break; + case dslash_test_type::MatPC: stag_matpc(spinorRef, cpuFat, cpuLong, spinor, mass, 0, parity, dslash_type); break; + case dslash_test_type::Mat: stag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type); break; case dslash_test_type::MatDagMat: stag_matdag_mat(spinorRef, cpuFat, cpuLong, spinor, mass, dagger, dslash_type); break; @@ -214,7 +208,6 @@ struct StaggeredDslashTestWrapper { // set verbosity prior to loadGaugeQuda setVerbosity(verbosity); - } void init() diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index e971e0327e..6e717437fe 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -178,13 +178,12 @@ std::vector eigensolve(test_t test_param) } logQuda(QUDA_SUMMARIZE, "Action = %s, Solver = %s, norm-op = %s, even-odd = %s, with SVD = %s, spectrum = %s\n", - get_dslash_str(dslash_type), - get_eig_type_str(eig_param.eig_type), eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? "true" : "false", + get_dslash_str(dslash_type), get_eig_type_str(eig_param.eig_type), + eig_param.use_norm_op == QUDA_BOOLEAN_TRUE ? "true" : "false", eig_param.use_pc == QUDA_BOOLEAN_TRUE ? "true" : "false", eig_param.compute_svd == QUDA_BOOLEAN_TRUE ? 
"true" : "false", get_eig_spectrum_str(eig_param.spectrum)); - if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE)) - display_test_info(eig_param); + if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE)) display_test_info(eig_param); // Vector construct START //---------------------------------------------------------------------------- @@ -228,7 +227,8 @@ std::vector eigensolve(test_t test_param) for (int i = 0; i < eig_n_conv; i++) { if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) { double _Complex sigma = evals[i]; - residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, cpuLongQDP); + residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, + cpuLongQDP); } else { double _Complex lambda = evals[i]; residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP); @@ -277,10 +277,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) - errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (!is_staggered(dslash_type)) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } if (eig_param.arpack_check && !(prec == QUDA_DOUBLE_PRECISION)) { @@ -288,10 +286,11 @@ int main(int argc, char **argv) } // Sanity check combinations of solve type and solution type - if ((solve_type == QUDA_DIRECT_SOLVE && solution_type != QUDA_MAT_SOLUTION) || - (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type != QUDA_MATPC_SOLUTION) || - (solve_type == QUDA_NORMOP_SOLVE 
&& solution_type != QUDA_MATDAG_MAT_SOLUTION)) { - errorQuda("Invalid combination of solve_type %s and solution_type %s", get_solve_str(solve_type), get_solution_str(solution_type)); + if ((solve_type == QUDA_DIRECT_SOLVE && solution_type != QUDA_MAT_SOLUTION) + || (solve_type == QUDA_DIRECT_PC_SOLVE && solution_type != QUDA_MATPC_SOLUTION) + || (solve_type == QUDA_NORMOP_SOLVE && solution_type != QUDA_MATDAG_MAT_SOLUTION)) { + errorQuda("Invalid combination of solve_type %s and solution_type %s", get_solve_str(solve_type), + get_solution_str(solution_type)); } initQuda(device_ordinal); @@ -301,12 +300,24 @@ int main(int argc, char **argv) // the staggered tests will fail. These checks are designed to be consistent // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked" bool changes = false; - if (!compute_fatlong) { compute_fatlong = true; changes = true; } + if (!compute_fatlong) { + compute_fatlong = true; + changes = true; + } double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 
1e-4 : 1e-5; - if (eig_tol != expected_tol) { eig_tol = expected_tol; changes = true; } - if (niter != 1000) { niter = 1000; changes = true; } - if (eig_n_kr != 256) { eig_n_kr = 256; changes = true; } + if (eig_tol != expected_tol) { + eig_tol = expected_tol; + changes = true; + } + if (niter != 1000) { + niter = 1000; + changes = true; + } + if (eig_n_kr != 256) { + eig_n_kr = 256; + changes = true; + } if (eig_block_size != 4) { eig_block_size = 4; } if (changes) { diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp index 6cf272cb5d..382510f74b 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -12,7 +12,8 @@ class StaggeredEigensolveTest : public ::testing::TestWithParam }; // Get the solve type that this combination corresponds to -QudaSolveType get_solve_type(QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd) { +QudaSolveType get_solve_type(QudaBoolean use_norm_op, QudaBoolean use_pc, QudaBoolean compute_svd) +{ if (use_norm_op == QUDA_BOOLEAN_FALSE && use_pc == QUDA_BOOLEAN_TRUE && compute_svd == QUDA_BOOLEAN_FALSE) return QUDA_DIRECT_PC_SOLVE; else if (use_norm_op == QUDA_BOOLEAN_TRUE && use_pc == QUDA_BOOLEAN_FALSE && compute_svd == QUDA_BOOLEAN_TRUE) @@ -37,8 +38,7 @@ bool skip_test(test_t test_param) // matpc // this is only legal for the staggered and asqtad op - if (!is_staggered(dslash_type)) - return true; + if (!is_staggered(dslash_type)) return true; // we can only compute the real part for Lanczos, and real or magnitude for Arnoldi switch (eig_type) { @@ -53,10 +53,9 @@ bool skip_test(test_t test_param) } } else if (combo_solve_type == QUDA_NORMOP_SOLVE) { // matdag_mat - + // this is only legal for the staggered and asqtad op - if (!is_staggered(dslash_type)) - return true; + if (!is_staggered(dslash_type)) return true; switch (eig_type) { case QUDA_EIG_TR_LANCZOS: @@ -64,22 +63,22 @@ bool skip_test(test_t test_param) if 
(spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; break; case QUDA_EIG_IR_ARNOLDI: - //if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true; + // if (spectrum == QUDA_SPECTRUM_LI_EIG || spectrum == QUDA_SPECTRUM_SI_EIG) return true; return true; // we skip this because it takes an unnecessarily long time and it's covered elsewhere break; default: return true; break; } } else if (combo_solve_type == QUDA_DIRECT_SOLVE) { // mat - + switch (dslash_type) { case QUDA_STAGGERED_DSLASH: // only Arnoldi, imaginary part or magnitude works (real part is degenerate) // We skip SM because it takes an unnecessarily long time and it's // covered by HISQ if (eig_type != QUDA_EIG_IR_ARNOLDI) return true; - if (spectrum != QUDA_SPECTRUM_LI_EIG && spectrum != QUDA_SPECTRUM_SI_EIG && - spectrum != QUDA_SPECTRUM_LM_EIG) return true; + if (spectrum != QUDA_SPECTRUM_LI_EIG && spectrum != QUDA_SPECTRUM_SI_EIG && spectrum != QUDA_SPECTRUM_LM_EIG) + return true; break; case QUDA_ASQTAD_DSLASH: // only Arnoldi, imaginary part or magnitude works (real part is degenerate) @@ -150,7 +149,7 @@ auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG); auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG, QUDA_SPECTRUM_SM_EIG, QUDA_SPECTRUM_LI_EIG, QUDA_SPECTRUM_SI_EIG); -//using test_t = ::testing::tuple --solve-type direct-pc --solution-type mat --inv-type cg --matpc odd-odd\n"); printfQuda("--test 3 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even\n"); printfQuda("--test 4 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd\n"); - printfQuda("--test 5 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even --multishift 8\n"); - printfQuda("--test 6 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd --multishift 8\n"); + printfQuda( + "--test 5 -> 
--solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc even-even --multishift 8\n"); + printfQuda( + "--test 6 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd --multishift 8\n"); } GaugeField cpuFatQDP = {}; @@ -247,8 +249,8 @@ std::vector> solve(test_t param) // schwarz parameters auto schwarz_param = ::testing::get<6>(param); - inv_param.schwarz_type = ::testing::get<0>(schwarz_param); - inv_param.inv_type_precondition = ::testing::get<1>(schwarz_param); + inv_param.schwarz_type = ::testing::get<0>(schwarz_param); + inv_param.inv_type_precondition = ::testing::get<1>(schwarz_param); inv_param.cuda_prec_precondition = ::testing::get<2>(schwarz_param); inv_param.residual_type = ::testing::get<7>(param); @@ -282,7 +284,7 @@ std::vector> solve(test_t param) std::vector out_multishift(Nsrc * multishift); quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param); - std::vector> _hp_multi_x(Nsrc, std::vector(multishift)); + std::vector> _hp_multi_x(Nsrc, std::vector(multishift)); // Staggered vector construct END //----------------------------------------------------------------------------------- @@ -370,7 +372,7 @@ std::vector> solve(test_t param) gflops[n] = inv_param.gflops / inv_param.secs; iter[n] = inv_param.iter; printfQuda("Done: %i iter / %g secs = %g Gflops\n\n", inv_param.iter, inv_param.secs, - inv_param.gflops / inv_param.secs); + inv_param.gflops / inv_param.secs); } } else { inv_param.num_src = Nsrc; @@ -392,7 +394,7 @@ std::vector> solve(test_t param) inv_param.gflops /= comm_size() / num_sub_partition; quda::comm_allreduce_max(inv_param.secs); printfQuda("Done: %d sub-partitions - %i iter / %g secs = %g Gflops\n\n", num_sub_partition, inv_param.iter, - inv_param.secs, inv_param.gflops / inv_param.secs); + inv_param.secs, inv_param.gflops / inv_param.secs); } // Free the multigrid solver @@ -408,7 +410,8 @@ std::vector> solve(test_t param) if (multishift > 1) { 
printfQuda("\nSource %d:\n", n); // Create an appropriate subset of the full out_multishift vector - std::vector out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; + std::vector out_subset + = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); } else { res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param); @@ -439,7 +442,8 @@ int main(int argc, char **argv) add_multigrid_option_group(app); add_comms_option_group(app); add_testing_option_group(app); - app->add_option("--legacy-test-info", print_legacy_info, "Print info on how to reproduce the old '--test #' behavior with flags, then exit"); + app->add_option("--legacy-test-info", print_legacy_info, + "Print info on how to reproduce the old '--test #' behavior with flags, then exit"); try { app->parse(argc, argv); } catch (const CLI::ParseError &e) { @@ -468,10 +472,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) - errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); - if (!is_staggered(dslash_type)) - errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); + if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } // Need to add support for LAPLACE MG? @@ -490,12 +492,24 @@ int main(int argc, char **argv) // the staggered tests will fail. 
These checks are designed to be consistent // with what's in [src]/tests/CMakeFiles.txt, which have been "sanity checked" bool changes = false; - if (!compute_fatlong) { compute_fatlong = true; changes = true; } + if (!compute_fatlong) { + compute_fatlong = true; + changes = true; + } double expected_tol = (prec == QUDA_SINGLE_PRECISION) ? 1e-5 : 1e-6; - if (tol != expected_tol) { tol = expected_tol; changes = true; } - if (tol_hq != expected_tol) { tol_hq = expected_tol; changes = true; } - if (niter != 1000) { niter = 1000; changes = true; } + if (tol != expected_tol) { + tol = expected_tol; + changes = true; + } + if (tol_hq != expected_tol) { + tol_hq = expected_tol; + changes = true; + } + if (niter != 1000) { + niter = 1000; + changes = true; + } if (changes) { printfQuda("For gtest, various defaults are changed:\n"); diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 27369b4a2f..a4e7bcda90 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -30,26 +30,29 @@ bool skip_test(test_t param) if (prec < prec_sloppy) return true; // outer precision >= sloppy precision if (!(QUDA_PRECISION & prec_sloppy)) return true; // precision not enabled so skip it if (!(QUDA_PRECISION & prec_precondition) && prec_precondition != QUDA_INVALID_PRECISION) - return true; // precision not enabled so skip it + return true; // precision not enabled so skip it if (prec_sloppy < prec_precondition) return true; // sloppy precision >= preconditioner precision // Skip if the inverter does not support batched update and batched update is greater than one if (!support_solution_accumulator_pipeline(inverter_type) && solution_accumulator_pipeline > 1) return true; // There's no MLocal or MdagMLocal support yet, this is left in for reference - //if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) + // if (is_normal_solve(param) && ::testing::get<0>(schwarz_param) != 
QUDA_INVALID_SCHWARZ) // if (dslash_type != QUDA_MOBIUS_DWF_DSLASH) return true; if (is_laplace(dslash_type)) { if (multishift > 1) return true; // Laplace doesn't support multishift - if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE) return true; // Laplace only supports direct solves + if (solution_type != QUDA_MAT_SOLUTION || solve_type != QUDA_DIRECT_SOLVE) + return true; // Laplace only supports direct solves } if (is_staggered(dslash_type)) { // the staggered and asqtad operators aren't HPD - if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type)) return true; + if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && is_hermitian_solver(inverter_type)) + return true; // MR struggles with the staggered and asqtad spectrum, it's not MR's fault - if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) + return true; } // split-grid doesn't support multigrid at present @@ -87,19 +90,16 @@ TEST_P(StaggeredInvertTest, verify) if (solution_type == QUDA_MAT_SOLUTION) { if (solve_type == QUDA_DIRECT_PC_SOLVE) verify_tol /= (0.5 * mass); // to solve the full operator to eps, solve the preconditioned to mass * eps - if (solve_type == QUDA_NORMOP_SOLVE) - verify_tol /= (0.5 * mass); // a proxy for the condition number + if (solve_type == QUDA_NORMOP_SOLVE) verify_tol /= (0.5 * mass); // a proxy for the condition number } // The power iterations method of determining the Chebyshev window // breaks down due to the nature of the spectrum of the direct operator auto ca_basis_tmp = inv_param.ca_basis; - if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_CA_GCR_INVERTER) - inv_param.ca_basis = QUDA_POWER_BASIS; + if (solve_type == QUDA_DIRECT_SOLVE && inverter_type == 
QUDA_CA_GCR_INVERTER) inv_param.ca_basis = QUDA_POWER_BASIS; // Single precision needs a tiny bump due to small host/device precision deviations - if (prec == QUDA_SINGLE_PRECISION) - verify_tol *= 1.01; + if (prec == QUDA_SINGLE_PRECISION) verify_tol *= 1.01; for (auto rsd : solve(GetParam())) { if (res_t & QUDA_L2_RELATIVE_RESIDUAL) { EXPECT_LE(rsd[0], verify_tol); } @@ -136,14 +136,14 @@ using ::testing::Combine; using ::testing::Values; auto staggered_pc_solvers - = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER, QUDA_GCR_INVERTER, - QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); + = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER, + QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); auto normal_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_PCG_INVERTER); -auto direct_solvers - = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, - QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); +auto direct_solvers = Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER, QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, + QUDA_CA_CGNE_INVERTER, QUDA_CA_CGNR_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER, + QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); auto sloppy_precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION); @@ -166,8 +166,9 @@ INSTANTIATE_TEST_SUITE_P(EvenOdd, StaggeredInvertTest, // full system normal solve INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredInvertTest, - Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE), - sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION, 
QUDA_MAT_SOLUTION), + Values(QUDA_NORMOP_SOLVE), sloppy_precisions, Values(1), + solution_accumulator_pipelines, no_schwarz, no_heavy_quark), gettestname); // full system direct solve @@ -178,23 +179,22 @@ INSTANTIATE_TEST_SUITE_P(Full, StaggeredInvertTest, // preconditioned multi-shift solves INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest, - Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), - Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(10), - solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE), + sloppy_precisions, Values(10), solution_accumulator_pipelines, no_schwarz, + no_heavy_quark), gettestname); // Heavy-Quark preconditioned solves INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest, - Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), - Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1), - solution_accumulator_pipelines, no_schwarz, + Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE), + sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)), gettestname); // These are left in but commented out for future reference // Schwarz-preconditioned normal solves -//INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest, +// INSTANTIATE_TEST_SUITE_P(SchwarzNormal, StaggeredInvertTest, // Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION), // Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1), // solution_accumulator_pipelines, @@ -204,11 +204,10 @@ INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest, // gettestname); // Schwarz-preconditioned direct solves -//INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, StaggeredInvertTest, +// INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, StaggeredInvertTest, // 
Combine(Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE), // sloppy_precisions, Values(1), solution_accumulator_pipelines, // Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_MR_INVERTER, QUDA_CA_GCR_INVERTER), // Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)), // no_heavy_quark), // gettestname); - diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index 8530286dbd..17b12edcea 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -502,7 +502,8 @@ std::shared_ptr make_app(std::string app_description, std::string app_n ->transform(CLI::QUDACheckedTransformer(mass_normalization_map)); quda_app - ->add_option("--matpc", matpc_type, "Matrix preconditioning type (even-even (default), odd-odd, even-even-asym, odd-odd-asym)") + ->add_option("--matpc", matpc_type, + "Matrix preconditioning type (even-even (default), odd-odd, even-even-asym, odd-odd-asym)") ->transform(CLI::QUDACheckedTransformer(matpc_type_map)); quda_app->add_option("--msrc", Msrc, "Used for testing non-square block blas routines where nsrc defines the other dimension"); @@ -601,9 +602,9 @@ std::shared_ptr make_app(std::string app_description, std::string app_n "The pipeline length for fused solution accumulation (default 0, no pipelining)"); quda_app - ->add_option( - "--solution-type", solution_type, - "The solution we desire (mat (default for Wilson-type), mat-dag-mat, mat-pc (default for staggered-type), mat-pc-dag-mat-pc (default for Wilson-type multi-shift))") + ->add_option("--solution-type", solution_type, + "The solution we desire (mat (default for Wilson-type), mat-dag-mat, mat-pc (default for " + "staggered-type), mat-pc-dag-mat-pc (default for Wilson-type multi-shift))") ->transform(CLI::QUDACheckedTransformer(solution_type_map)); quda_app @@ -617,8 +618,9 @@ std::shared_ptr make_app(std::string app_description, std::string app_n ->expected(4); quda_app - 
->add_option("--solve-type", solve_type, - "The type of solve to do (direct, direct-pc (default for staggered-type), normop, normop-pc (default for Wilson-type), normerr, normerr-pc)") + ->add_option( + "--solve-type", + solve_type, "The type of solve to do (direct, direct-pc (default for staggered-type), normop, normop-pc (default for Wilson-type), normerr, normerr-pc)") ->transform(CLI::QUDACheckedTransformer(solve_type_map)); quda_app ->add_option("--solver-ext-lib-type", solver_ext_lib, "Set external library for the solvers (default Eigen library)") @@ -759,9 +761,12 @@ void add_eigen_option_group(std::shared_ptr quda_app) opgroup->add_option("--eig-use-dagger", eig_use_dagger, "Solve the Mdag problem instead of M (MMdag if eig-use-normop == true) (default false)"); - opgroup->add_option("--eig-use-normop", eig_use_normop, - "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false for Wilson-type, true for staggered-type)"); - opgroup->add_option("--eig-use-pc", eig_use_pc, "Solve the Even-Odd preconditioned problem (default false for Wilson-type, true for staggered-type)"); + opgroup->add_option( + "--eig-use-normop", + eig_use_normop, "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false for Wilson-type, true for staggered-type)"); + opgroup->add_option( + "--eig-use-pc", eig_use_pc, + "Solve the Even-Odd preconditioned problem (default false for Wilson-type, true for staggered-type)"); opgroup->add_option("--eig-use-poly-acc", eig_use_poly_acc, "Use Chebyshev polynomial acceleration in the eigensolver"); } diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 15dae3b9d2..24659b23e1 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -339,7 +339,7 @@ bool is_normal_solve(QudaInverterType inv_type, QudaSolveType solve_type) bool is_hermitian_solver(QudaInverterType type) { - switch(type) { + switch (type) { case QUDA_CG_INVERTER: case 
QUDA_CA_CG_INVERTER: return true; default: return false; diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 9431b3ce67..24a8668e7d 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -41,7 +41,8 @@ extern QudaPrecision &cuda_prec_refinement_sloppy; extern QudaPrecision &cuda_prec_ritz; // Determine if the Laplace operator has been defined -constexpr bool is_enabled_laplace() { +constexpr bool is_enabled_laplace() +{ #ifdef QUDA_LAPLACE return true; #else diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp index 24eface277..85e7993ba5 100644 --- a/tests/utils/staggered_gauge_utils.cpp +++ b/tests/utils/staggered_gauge_utils.cpp @@ -31,8 +31,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat // Similarly, gauge links can only be built in single or double, so upscale the build precision // if neccessary. auto gauge_param = gauge_param_in; - if (gauge_param.cuda_prec < QUDA_SINGLE_PRECISION) - gauge_param.cuda_prec = QUDA_SINGLE_PRECISION; + if (gauge_param.cuda_prec < QUDA_SINGLE_PRECISION) gauge_param.cuda_prec = QUDA_SINGLE_PRECISION; gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; // probably irrelevant From 431c4ecf2884e7630d8b8b64655a1cf7d0bbe3f7 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 10 Jan 2024 09:06:46 -0800 Subject: [PATCH 53/53] Updated comments in TRLM to reflect code changes --- lib/eig_trlm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index d994fbc272..02d8c26246 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -174,7 +174,7 @@ namespace quda logQuda(QUDA_SUMMARIZE, "TRLM computed the requested %d vectors in %d restart steps and %d OP*x operations.\n", n_conv, restart_iter, iter); - // Dump all Ritz values and residua if using Chebyshev + // Dump all Ritz values and residua for (int i = 0; i < n_conv; i++) { 
logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]); }