diff --git a/.gitignore b/.gitignore index a60ac7da48..8e122f0a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ include/jitify_options.hpp .tags* autom4te.cache/* .vscode +cmake/CPM_*.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 803f5dba41..512a7b9baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,12 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() +# For now we only support register tiles for the staggered dslash operators +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels (staggered only)") +if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) + message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") +endif() + set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)") @@ -275,6 +281,7 @@ mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F) mark_as_advanced(QUDA_MAX_MULTI_BLAS_N) mark_as_advanced(QUDA_MAX_MULTI_RHS) +mark_as_advanced(QUDA_MAX_MULTI_RHS_TILE) mark_as_advanced(QUDA_PRECISION) mark_as_advanced(QUDA_RECONSTRUCT) mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE) @@ -420,20 +427,12 @@ if(QUDA_DOWNLOAD_EIGEN) CPMAddPackage( NAME Eigen VERSION ${QUDA_EIGEN_VERSION} - URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 + URL https://gitlab.com/libeigen/eigen/-/archive/e67c494cba7180066e73b9f6234d0b2129f1cdf5.tar.bz2 + URL_HASH SHA256=98d244932291506b75c4ae7459af29b1112ea3d2f04660686a925d9ef6634583 DOWNLOAD_ONLY YES SYSTEM YES) target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR}) install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE) - - # Eigen 3.4 needs to be patched on Neon with nvc++ - if (${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") - set(CMAKE_PATCH_EIGEN OFF CACHE BOOL "Internal use only; do not modify") - if (NOT CMAKE_PATCH_EIGEN) - execute_process(COMMAND patch -N "${Eigen_SOURCE_DIR}/Eigen/src/Core/arch/NEON/Complex.h" "${CMAKE_SOURCE_DIR}/cmake/eigen34_neon.diff") - set(CMAKE_PATCH_EIGEN ON CACHE BOOL "Internal use only; do not modify" FORCE) - endif() - endif() else() # fall back to using find_package find_package(Eigen QUIET) diff --git a/cmake/eigen34_neon.diff b/cmake/eigen34_neon.diff deleted file mode 100644 index 7190ca7fa4..0000000000 --- a/cmake/eigen34_neon.diff +++ /dev/null @@ -1,8 +0,0 @@ -21c21 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ -393c393 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ diff --git a/include/blas_3d.h b/include/blas_3d.h new file mode 100644 index 0000000000..2616a8ac03 --- /dev/null +++ b/include/blas_3d.h @@ -0,0 +1,80 @@ +#pragma once + +#include <vector> +#include <color_spinor_field.h> + +namespace quda +{ + + namespace blas3d + { + + // Local enum for the 3D copy type + enum class copyType { COPY_TO_3D, COPY_FROM_3D, SWAP_3D }; + + /** + @brief Extract / insert / swap a timeslice between a 4-d field and a 3-d field + @param[in] slice Which slice + @param[in] type Extracting a time slice (COPY_TO_3D) or + inserting a timeslice (COPY_FROM_3D) or swapping time slices + (SWAP_3D) + @param[in,out] x 3-d field + @param[in,out] y 4-d field + */ + void copy(int slice, copyType type, ColorSpinorField &x, ColorSpinorField &y); + + /** + @brief Swap the slice in two given fields + @param[in] slice The slice we wish to swap in the fields
@param[in,out] x Field whose slice we wish to swap + @param[in,out] y Field whose slice we wish to swap + */ + void swap(int slice, ColorSpinorField &x, ColorSpinorField &y); + + /** + @brief Compute a set of real-valued inner products <x,y>, where each inner + product is restricted to a timeslice. + @param[out] result Vector of spatial inner products + @param[in] x Left vector field + @param[in] y Right vector field + */ + void reDotProduct(std::vector<double> &result, const ColorSpinorField &x, const ColorSpinorField &y); + + /** + @brief Compute a set of complex-valued inner products <x,y>, where each inner + product is restricted to a timeslice. + @param[out] result Vector of spatial inner products + @param[in] x Left vector field + @param[in] y Right vector field + */ + void cDotProduct(std::vector<Complex> &result, const ColorSpinorField &x, const ColorSpinorField &y); + + /** + @brief Timeslice real-valued scaling of the field + @param[in] a Vector of scale factors (length = local temporal extent) + @param[in] x Field we wish to scale + */ + void ax(std::vector<double> &a, ColorSpinorField &x); + + /** + @brief Timeslice real-valued axpby computation + @param[in] a Vector of scale factors (length = local temporal extent) + @param[in] x Input field + @param[in] b Vector of scale factors (length = local temporal extent) + @param[in,out] y Field we are updating + */ + void axpby(const std::vector<double> &a, const ColorSpinorField &x, const std::vector<double> &b, + ColorSpinorField &y); + + /** + @brief Timeslice complex-valued axpby computation + @param[in] a Vector of scale factors (length = local temporal extent) + @param[in] x Input field + @param[in] b Vector of scale factors (length = local temporal extent) + @param[in,out] y Field we are updating + */ + void caxpby(const std::vector<Complex> &a, const ColorSpinorField &x, const std::vector<Complex> &b, + ColorSpinorField &y); + + } // namespace blas3d +} // namespace quda diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 18de570604..52106e24fa 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -280,6 +280,8 @@ namespace quda //! for deflation etc. if (is_composite) printfQuda("Number of elements = %d\n", composite_dim); } + + void change_dim(int D, int d) { x[D] = d; } }; struct DslashConstant; @@ -1054,6 +1056,38 @@ namespace quda */ void spinorDistanceReweight(ColorSpinorField &src, double alpha0, int t0); + /** + @brief Helper function for determining if the spin of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @return If the spins match, return the number of spins + */ + inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b) + { + int nSpin = 0; + if (a.Nspin() == b.Nspin()) + nSpin = a.Nspin(); + else + errorQuda("Spin %d %d do not match (%s:%d in %s())", a.Nspin(), b.Nspin(), file, line, func); + return nSpin; + } + + /** + @brief Helper function for determining if the spin of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @param[in] args List of additional fields to check spin on + @return If the spins match, return the number of spins + */ + template <typename... Args> + inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b, + const Args &...args) + { + return Spin_(func, file, line, a, b) & Spin_(func, file, line, a, args...); + } + +#define checkSpin(...) \
Spin_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** @brief Helper function for determining if the preconditioning type of the fields is the same. diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index d21121565f..e8049bfb85 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -971,16 +971,19 @@ namespace quda norm_t *norm = nullptr; int norm_offset = 0; if constexpr (fixed) { - if constexpr (block_float) { + if constexpr (fixed && block_float && nColor == 3 && nSpin == 1 && nVec == 1) { + norm = v.norm; + norm_offset = parity * v.norm_offset + 4 * x_cb + 3; + } else if constexpr (block_float) { norm = v.norm; - norm_offset = v.norm_offset; + norm_offset = parity * v.norm_offset + x_cb; } else { scale = v.scale; scale_inv = v.scale_inv; } } return fieldorder_wrapper( - v.v, accessor.index(parity, x_cb, s, c, n, volumeCB), scale, scale_inv, norm, parity * norm_offset + x_cb); + v.v, accessor.index(parity, x_cb, s, c, n, volumeCB), scale, scale_inv, norm, norm_offset); } /** Returns the number of field colors */ diff --git a/include/comm_key.h b/include/comm_key.h index 6618068bb0..a9682ddc03 100644 --- a/include/comm_key.h +++ b/include/comm_key.h @@ -1,27 +1,33 @@ #pragma once +#include + namespace quda { struct CommKey { static constexpr int n_dim = 4; + array key = {0, 0, 0, 0}; - int array[n_dim] = {0, 0, 0, 0}; + constexpr int product() { return key[0] * key[1] * key[2] * key[3]; } - constexpr inline int product() { return array[0] * array[1] * array[2] * array[3]; } + constexpr int &operator[](int d) { return key[d]; } - constexpr inline int &operator[](int d) { return array[d]; } + constexpr const int &operator[](int d) const { return key[d]; } - constexpr inline const int &operator[](int d) const { return array[d]; } + constexpr auto data() { return key.data; } - constexpr inline int *data() { return array; } + constexpr auto data() const { return key.data; } - constexpr inline const int *data() const { return array; } + constexpr bool is_valid() const { return (key[0] > 0) && (key[1] > 0) && (key[2] > 0) && (key[3] > 0); } - constexpr inline bool is_valid() const + bool operator==(const CommKey &other) const { - return (array[0] > 0) && (array[1] > 0) && (array[2] > 0) && (array[3] > 0); + bool is_same = true; + for (auto i = 0; i < n_dim; i++) + if (key[i] != other.key[i]) is_same = false; + return is_same; } }; diff --git a/include/comm_quda.h b/include/comm_quda.h index 4341cf58d9..8f37bcd032 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -50,12 +50,26 @@ namespace quda int comm_dim(int dim); /** - Return the coording of this process in the dimension dim + Return the global number of processes in the dimension dim + @param dim Dimension which we are querying + @return Length of process dimensions + */ + int comm_dim_global(int dim); + + /** + Return the coordinate of this process in the dimension dim @param dim Dimension which we are querying @return Coordinate of this process */ int comm_coord(int dim); + /** + Return the global coordinates of this process in the dimension dim + @param dim Dimension which we are querying + @return Coordinate of this process + */ + int comm_coord_global(int dim); + /** * Declare a message handle for sending `nbytes` to the `rank` with `tag`. 
*/ diff --git a/include/complex_quda.h b/include/complex_quda.h index 18da63def5..1481dec40f 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -360,23 +360,19 @@ struct complex typedef ValueType value_type; // Constructors - __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) + __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) { real(re); imag(im); } - template - __host__ __device__ - inline complex(const complex & z) + template __host__ __device__ inline complex(const complex &z) { real(z.real()); imag(z.imag()); } - template - __host__ __device__ - inline complex(const std::complex & z) + template __host__ __device__ inline complex(const std::complex &z) { real(z.real()); imag(z.imag()); @@ -436,12 +432,11 @@ struct complex template <> struct complex : public float2 { public: typedef float value_type; - complex() = default; - constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } + complex() = default; + constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } template - constexpr complex(const std::complex &z) : - float2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : float2 {static_cast(z.real()), static_cast(z.imag())} { } @@ -500,16 +495,15 @@ template <> struct complex : public float2 { template <> struct complex : public double2 { public: typedef double value_type; - complex() = default; - constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } + complex() = default; + constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } template - constexpr complex(const std::complex &z) : - double2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : double2 {static_cast(z.real()), static_cast(z.imag())} { } - template __host__ __device__ inline complex &operator=(const complex &z) + template __host__ __device__ inline complex &operator=(const complex &z) { real(z.real()); imag(z.imag()); @@ -572,9 +566,9 @@ template <> struct complex : public char2 { public: typedef int8_t value_type; - complex() = default; + complex() = default; - constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } + constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -608,9 +602,9 @@ struct complex : public short2 public: typedef short value_type; - complex() = default; + complex() = default; - constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } + constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -644,9 +638,9 @@ struct complex : public int2 public: typedef int value_type; - complex() = default; + complex() = default; - constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } + constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { diff --git a/include/dslash.h b/include/dslash.h index 4b1aada52e..80638b494e 100644 --- a/include/dslash.h +++ b/include/dslash.h @@ -67,6 +67,10 @@ namespace quda if (arg.xpay) strcat(aux_base, ",xpay"); if (arg.dagger) strcat(aux_base, ",dagger"); setRHSstring(aux_base, in.size()); + 
strcat(aux_base, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, Arg::n_src_tile); + strcat(aux_base, tile_str); } /** @@ -329,7 +333,13 @@ namespace quda Dslash(Arg &arg, cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const std::string &app_base = "") : - TunableKernel3D(in[0], halo.X(4), arg.nParity), arg(arg), out(out), in(in), halo(halo), nDimComms(4), dslashParam(arg) + TunableKernel3D(in[0], (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity), + arg(arg), + out(out), + in(in), + halo(halo), + nDimComms(4), + dslashParam(arg) { if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU Fields not supported in Dslash framework yet"); diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index df176549fc..0e49e64b68 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -12,12 +12,7 @@ #include #include -#if defined(_NVHPC_CUDA) -#include -constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE; -#else constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; -#endif #include @@ -241,11 +236,12 @@ namespace quda return true; } - template struct DslashArg { + template struct DslashArg { using Float = Float_; using real = typename mapper::type; static constexpr int nDim = nDim_; + static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread const int parity; // only use this for single parity fields const int nParity; // number of parities we're working on @@ -269,6 +265,7 @@ namespace quda int threadDimMapLower[4]; int threadDimMapUpper[4]; + int_fastdiv n_src; int_fastdiv Ls; // these are set with symmetric preconditioned twisted-mass dagger @@ -327,6 +324,7 @@ namespace quda exterior_threads(0), threadDimMapLower {}, threadDimMapUpper {}, + n_src(in.size()), Ls(halo.X(4) / in.size()), twist_a(0.0), twist_b(0.0), @@ -650,8 +648,9 @@ namespace quda Arg arg; dslash_functor_arg(const Arg &arg, unsigned int threads_x) : - kernel_param(dim3(threads_x, arg.dc.Ls, arg.nParity)), - arg(arg) { } + kernel_param(dim3(threads_x, (arg.dc.Ls + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity)), arg(arg) + { + } }; /** diff --git a/include/dslash_quda.h b/include/dslash_quda.h index 38efe07e6c..32111e92f9 100644 --- a/include/dslash_quda.h +++ b/include/dslash_quda.h @@ -773,9 +773,12 @@ namespace quda @param[in] a Scale factor applied to derivative @param[in] b Scale factor applied to aux field @param[in] x Vector field we accumulate onto to + @param[in] parity Destination parity + @param[in] comm_override Override for which dimensions are partitioned + @param[in] profile The TimeProfile used for profiling the dslash */ void ApplyLaplace(cvector_ref &out, cvector_ref &in, const GaugeField &U, - int dir, double a, double b, cvector_ref &x, int parity, bool dagger, + int dir, double a, double b, cvector_ref &x, int parity, const int *comm_override, TimeProfile &profile); /** diff --git a/include/eigen_helper.h b/include/eigen_helper.h index 8c1bf012a8..373f9ed865 100644 --- a/include/eigen_helper.h +++ b/include/eigen_helper.h @@ -5,10 +5,6 @@ #define EIGEN_USE_BLAS #endif -#if defined(__NVCOMPILER) // WAR for nvc++ until we update to latest Eigen -#define EIGEN_DONT_VECTORIZE -#endif - #include // hide annoying warning diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h index ee01a51b32..4fcea4b3d3 100644 --- a/include/eigensolve_quda.h +++ b/include/eigensolve_quda.h @@ -148,7 +148,7 @@ namespace quda @param[in] out Output spinor @param[in] 
in Input spinor */ - double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in); + virtual double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in); /** @brief Orthogonalise input vectors r against @@ -263,22 +263,10 @@ namespace quda @brief Compute eigenvalues and their residiua @param[in] mat Matrix operator @param[in] evecs The eigenvectors - @param[in] evals The eigenvalues + @param[in,out] evals The eigenvalues @param[in] size The number of eigenvalues to compute */ - void computeEvals(std::vector &evecs, std::vector &evals, - int size); - - /** - @brief Compute eigenvalues and their residiua. This variant compute the number of converged eigenvalues. - @param[in] mat Matrix operator - @param[in] evecs The eigenvectors - @param[in] evals The eigenvalues - */ - void computeEvals(std::vector &evecs, std::vector &evals) - { - computeEvals(evecs, evals, n_conv); - } + virtual void computeEvals(std::vector &evecs, std::vector &evals, int size = 0); /** @brief Load and check eigenpairs from file @@ -461,6 +449,116 @@ namespace quda void computeBlockKeptRitz(std::vector &kSpace); }; + /** + @brief Thick Restarted Lanczos Method for 3D slices + */ + class TRLM3D : public EigenSolver + { + bool verbose_rank = false; /** Whether this rank is one that logs */ + + public: + /** + @brief Constructor for Thick Restarted Eigensolver class + @param eig_param The eigensolver parameters + @param mat The operator to solve + */ + TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param); + + /** + @return Whether the solver is only for Hermitian systems + */ + virtual bool hermitian() override { return true; } /** TRLM3D is only for Hermitian systems */ + + // Variable size matrix (for the 3D problem) + std::vector> ritz_mat_3D; + + // Arrays for 3D residua + std::vector> residua_3D; + + // Array for convergence + std::vector converged_3D; + std::vector active_3D; + std::vector iter_locked_3D; + std::vector iter_keep_3D; + std::vector iter_converged_3D; + std::vector num_locked_3D; + std::vector num_keep_3D; + std::vector num_converged_3D; + + // Tridiagonal/Arrow matrices, fixed size (for the 3D problem) + std::vector> alpha_3D; + std::vector> beta_3D; + + // The orthogonal direction and size in the 3D problem + int ortho_dim; + int ortho_dim_size; + + /** + @brief Compute eigenpairs + @param[in] kSpace Krylov vector space + @param[in] evals Computed eigenvalues + */ + void operator()(std::vector &kSpace, std::vector &evals) override; + + /** + @brief Lanczos step: extends the Krylov space. + @param[in] v Vector space + @param[in] j Index of vector being computed + */ + void lanczosStep3D(std::vector &v, int j); + + /** + @brief Reorder the Krylov space by eigenvalue + @param[in] kSpace the Krylov space + */ + void reorder3D(std::vector &kSpace); + + /** + @brief Get the eigendecomposition from the arrow matrix + */ + void eigensolveFromArrowMat3D(); + + /** + @brief Rotate the Ritz vectors using the arrow matrix eigendecomposition + @param[in] nKspace current Krylov space + */ + void computeKeptRitz3D(std::vector &kSpace); + + /** + @brief Orthogonalise input vectors r against + vector space v using block-BLAS + @param[in] v Vector space + @param[in] r Vectors to be orthogonalised + @param[in] j Use vectors v[0:j] + @param[in] s array of + */ + void blockOrthogonalize3D(std::vector &v, std::vector &r, int j); + + /** + @brief Check for an initial guess. 
If none present, populate with rands, then + orthonormalise + @param[in] kSpace The Krylov space vectors + */ + void prepareInitialGuess3D(std::vector &kSpace, int ortho_dim_size); + + /** + @brief Estimate the spectral radius of the operator for the max value of the + Chebyshev polynomial + @param[in] mat Matrix operator + @param[in] out Output spinor + @param[in] in Input spinor + */ + double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) override; + + /** + @brief Compute eigenvalues and their residiua + @param[in] evecs The eigenvectors + @param[in,out] evals The eigenvalues + @param[in] size The number of eigenvalues to compute + */ + void computeEvals(std::vector &evecs, std::vector &evals, int size = 0) override; + }; + /** @brief Implicitly Restarted Arnoldi Method. */ diff --git a/include/enum_quda.h b/include/enum_quda.h index f281d33feb..462ce98cf0 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -138,6 +138,7 @@ typedef enum QudaInverterType_s { typedef enum QudaEigType_s { QUDA_EIG_TR_LANCZOS, // Thick restarted lanczos solver QUDA_EIG_BLK_TR_LANCZOS, // Block Thick restarted lanczos solver + QUDA_EIG_TR_LANCZOS_3D, // Thick restarted lanczos solver for 3-d systems QUDA_EIG_IR_ARNOLDI, // Implicitly Restarted Arnoldi solver QUDA_EIG_BLK_IR_ARNOLDI, // Block Implicitly Restarted Arnoldi solver QUDA_EIG_INVALID = QUDA_INVALID_ENUM @@ -277,8 +278,6 @@ typedef enum QudaVerbosity_s { QUDA_INVALID_VERBOSITY = QUDA_INVALID_ENUM } QudaVerbosity; -typedef enum QudaTune_s { QUDA_TUNE_NO, QUDA_TUNE_YES, QUDA_TUNE_INVALID = QUDA_INVALID_ENUM } QudaTune; - typedef enum QudaPreserveDirac_s { QUDA_PRESERVE_DIRAC_NO, QUDA_PRESERVE_DIRAC_YES, diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 596232a7b3..8874959d7b 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -122,8 +122,9 @@ #define QudaEigType integer(4) #define QUDA_EIG_TR_LANCZOS 0 // Thick Restarted Lanczos Solver #define QUDA_EIG_BLK_IR_LANCZOS 1 // Block Thick Restarted Lanczos Solver -#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver -#define QUDA_EIG_BLK_IR_ARNOLDI 3 // Block Implicitly restarted Arnoldi solver (not yet implemented) +#define QUDA_EIG_TR_LANCZOS_3D 2 // Thick Restarted Lanczos Solver for 3-d systems +#define QUDA_EIG_IR_ARNOLDI 3 // Implicitly restarted Arnoldi solver +#define QUDA_EIG_BLK_IR_ARNOLDI 4 // Block Implicitly restarted Arnoldi solver (not yet implemented) #define QUDA_EIG_INVALID QUDA_INVALID_ENUM #define QudaEigSpectrumType integer(4) @@ -247,11 +248,6 @@ #define QUDA_DEBUG_VERBOSE 3 #define QUDA_INVALID_VERBOSITY QUDA_INVALID_ENUM -#define QudaTune integer(4) -#define QUDA_TUNE_NO 0 -#define QUDA_TUNE_YES 1 -#define QUDA_TUNE_INVALID QUDA_INVALID_ENUM - #define QudaPreserveDirac integer(4) #define QUDA_PRESERVE_DIRAC_NO 0 #define QUDA_PRESERVE_DIRAC_YES 1 diff --git a/include/field_cache.h b/include/field_cache.h index 5e05f9acf6..4def6faa3a 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -15,7 +15,7 @@ namespace quda { */ template struct FieldKey { - std::string volume; /** volume kstring */ + std::string volume; /** volume string */ std::string aux; /** auxiliary string */ FieldKey() = default; @@ -78,6 +78,18 @@ namespace quda { */ FieldTmp(const FieldKey &key, const typename T::param_type ¶m); + /** + @brief Create a field temporary that corresponds to the field + constructed from the param struct. 
If a matching field is + present in the cache, it will be popped from the cache. If no + such temporary exists a temporary will be allocated. + @param[in] key Key corresponding to the field instance we + require + @param[in] param Parameter structure used to allocated + the temporary + */ + FieldTmp(typename T::param_type param); + /** @brief Copy constructor is deleted to prevent accidental cache bloat @@ -111,6 +123,18 @@ namespace quda { */ template auto getFieldTmp(const T &a) { return FieldTmp(a); } + /** + @brief Get a field temporary that is identical to the field + instance argument. If a matching field is present in the cache, + it will be popped from the cache. If no such temporary exists, a + temporary will be allocated. When the destructor for the + FieldTmp is called, e.g., the returned object goes out of scope, + the temporary will be pushed onto the cache. + + @param[in] a Field we wish to create a matching temporary for + */ + template auto getFieldTmp(const typename T::param_type ¶m) { return FieldTmp(param); } + /** @brief Get a vector of field temporaries that are identical to the vector instance argument. If enough matching fields are diff --git a/include/gauge_field.h b/include/gauge_field.h index a62b6a684f..776681e44b 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -250,12 +250,12 @@ namespace quda { */ void setTuningString(); + public: /** @brief Initialize the padded region to 0 */ void zeroPad(); - public: /** @brief Default constructor */ @@ -455,7 +455,7 @@ namespace quda { std::enable_if_t && !std::is_pointer_v::type>, T> data() const { if (is_pointer_array(order)) errorQuda("Non dim-array ordered field requested but order is %d", order); - return reinterpret_cast(gauge.data()); + return static_cast(gauge.data()); } /** @@ -473,7 +473,7 @@ namespace quda { "data() requires a pointer cast type"); if (d >= (unsigned)geometry) errorQuda("Invalid array index %d for geometry %d field", d, geometry); if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); - return reinterpret_cast(gauge_array[d].data()); + return static_cast(gauge_array[d].data()); } void *raw_pointer() const @@ -500,7 +500,7 @@ namespace quda { { if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); array u = {}; - for (auto d = 0; d < geometry; d++) u[d] = static_cast(gauge_array[d]); + for (auto d = 0; d < geometry; d++) u[d] = static_cast(gauge_array[d].data()); return u; } @@ -651,9 +651,28 @@ namespace quda { } } + /** + * @brief Print the site data + * @param[in] parity Parity index + * @param[in] dim The dimension in which we are printing + * @param[in] x_cb Checkerboard space-time index + * @param[in] rank The rank we are requesting from (default is rank = 0) + */ + void PrintMatrix(int dim, int parity, unsigned int x_cb, int rank = 0) const; + friend struct GaugeFieldParam; }; + /** + @brief Print the value of the field at the requested coordinates + @param[in] a The field we are printing from + @param[in] dim The dimension in which we are printing + @param[in] parity Parity index + @param[in] x_cb Checkerboard space-time index + @param[in] rank The rank we are requesting from (default is rank = 0) + */ + void genericPrintMatrix(const GaugeField &a, int dim, int parity, unsigned int x_cb, int rank = 0); + /** @brief This is a debugging function, where we cast a gauge field into a spinor field so we can compute its L1 norm. 
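A minimal usage sketch for the blas3d interface declared above in include/blas_3d.h. The function names and signatures are taken from the header itself; the construction of the 3-d field x3 and 4-d field y4 is assumed, as is sizing the coefficient vectors to y4.X(3) (the local temporal extent), which is my reading of the doc comments:

#include <vector>
#include <blas_3d.h>

using namespace quda;

void timeslice_demo(ColorSpinorField &x3, ColorSpinorField &y4, int t)
{
  // pull timeslice t of the 4-d field into the 3-d field
  blas3d::copy(t, blas3d::copyType::COPY_TO_3D, x3, y4);

  // one real inner product per local timeslice
  std::vector<double> w(y4.X(3));
  blas3d::reDotProduct(w, y4, y4);

  // rescale each timeslice by its inverse norm (zero guard omitted)
  std::vector<double> a(w.size());
  for (auto s = 0u; s < w.size(); s++) a[s] = 1.0 / w[s];
  blas3d::ax(a, y4);
}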
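The variadic Spin_ helper above reduces pairwise checks with operator&: every pair either aborts via errorQuda or agrees on the same value, so the fold returns the common spin count. A self-contained sketch of the same pattern; FieldLike and check_spin_ are illustrative stand-ins, not QUDA API:

#include <cstdio>
#include <cstdlib>

struct FieldLike {
  int nspin;
  int Nspin() const { return nspin; }
};

inline int check_spin_(const char *func, const FieldLike &a, const FieldLike &b)
{
  if (a.Nspin() != b.Nspin()) { std::fprintf(stderr, "spin mismatch in %s()\n", func); std::abort(); }
  return a.Nspin();
}

template <typename... Args>
inline int check_spin_(const char *func, const FieldLike &a, const FieldLike &b, const Args &...args)
{
  // both calls return the same value when all checks pass, so & is just that value
  return check_spin_(func, a, b) & check_spin_(func, a, args...);
}

#define CHECK_SPIN(...) check_spin_(__func__, __VA_ARGS__)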
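The tiled launch geometry above, (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, is a ceiling division: each y-block owns a contiguous tile of right-hand sides, and the last tile may be ragged. A small illustrative helper, not QUDA code:

// number of y-blocks needed to cover n_src sources with tiles of width n_src_tile
constexpr int n_tiles(int n_src, int n_src_tile) { return (n_src + n_src_tile - 1) / n_src_tile; }
// e.g. n_tiles(7, 4) == 2: block 0 covers sources 0..3, block 1 the ragged tail 4..6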
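getFieldTmp above returns an RAII handle: a temporary matching the key (volume and aux strings) is popped from the cache, or allocated on a miss, and the destructor pushes it back. A usage sketch; treating tmp directly as a field assumes FieldTmp's implicit conversion to ColorSpinorField, and blas::copy is only a placeholder workload:

#include <field_cache.h>
#include <blas_quda.h>

void workspace_demo(const quda::ColorSpinorField &a)
{
  auto tmp = quda::getFieldTmp(a); // pop-or-allocate a temporary matching a
  quda::blas::copy(tmp, a);        // ... use it like any other field ...
} // tmp goes out of scope here and the temporary returns to the cache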
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index a8809d018f..57992908c2 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -153,8 +153,11 @@ namespace quda { template __host__ __device__ inline constexpr bool fixed_point() { return false; } template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } template __host__ __device__ inline constexpr bool match() { return false; } template<> __host__ __device__ inline constexpr bool match() { return true; } @@ -377,7 +380,7 @@ namespace quda { { for (int d = 0; d < U.Geometry(); d++) u[d] = gauge_ ? static_cast **>(gauge_)[d] : U.data *>(d); - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -459,7 +462,7 @@ namespace quda { ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); } - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -494,7 +497,7 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -583,7 +586,7 @@ namespace quda { ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); } - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -636,7 +639,7 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax())); } void resetScale(Float max) @@ -715,7 +718,7 @@ namespace quda { nullptr; ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d); } - resetScale(U.Scale()); + resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 
1.0 : U.LinkMax())); } void resetScale(Float max) @@ -745,8 +748,8 @@ namespace quda { @tparam nSpinCoarse Number of "spin degrees of freedom" (for coarse-link fields only) @tparam order Storage order of the field @tparam native_ghost Whether to use native ghosts (inlined into + the padded area for internal-order fields or use a separate array if false) @tparam storeFloat_ Underlying storage type for the field - the padded area for internal-order fields or use a separate array if false) */ template @@ -1478,7 +1481,7 @@ namespace quda { z *= scale; #pragma unroll for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]); - } else { // stagic phase + } else { // static phase #pragma unroll for (int i = 0; i < 9; i++) { out[i] *= phase; } } diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 990ff949e7..cf1faf72b0 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1106,4 +1106,26 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } + /** + @brief Compute the flattened 4-d index from a separate T and + flattened 3-d XYZ index. + @param[in] t Temporal index + @param[in] xyz Flattened 3-d index + @param[in] X Lattice dimensions + @return The flattened 4-d index + */ + template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) + { + int x[4]; +#pragma unroll + for (int d = 0; d < 4; d++) { + if (d != reduction_dim) { + x[d] = xyz % X[d]; + xyz = xyz / X[d]; + } + } + x[reduction_dim] = t; + return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); + } + } // namespace quda diff --git a/include/kernels/blas_3d.cuh b/include/kernels/blas_3d.cuh new file mode 100644 index 0000000000..e7426e89af --- /dev/null +++ b/include/kernels/blas_3d.cuh @@ -0,0 +1,331 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + struct baseArg : kernel_param<> { + int_fastdiv X[4]; // grid dimensions + + baseArg(dim3 threads, const ColorSpinorField &x) : kernel_param(threads) + { + for (int i = 0; i < 4; i++) { X[i] = x.X()[i]; } + if (x.SiteSubset() == 1) X[0] = 2 * X[0]; + } + }; + + template struct copy3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F y; + F x; + const int slice; + + copy3dArg(ColorSpinorField &y, ColorSpinorField &x, int slice) : + baseArg(dim3(y.VolumeCB(), y.SiteSubset(), 1), y), y(y), x(x), slice(slice) + { + } + }; + + template struct copyTo3d { + const Arg &arg; + constexpr copyTo3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + // Isolate the slice of the 4D array + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 4D data + Vector y = arg.y(x_cb, parity); + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + + // Write to 3D + arg.x(xyz / 2, xyz % 2) = y; + } + } + }; + + template struct copyFrom3d { + const Arg &arg; + constexpr copyFrom3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + int 
idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + Vector x = arg.x(xyz / 2, xyz % 2); + // Write to 4D + arg.y(x_cb, parity) = x; + } + } + }; + + template struct swap3d { + const Arg &arg; + constexpr swap3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + int idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + arg.x(x_cb, parity) = y; + arg.y(x_cb, parity) = x; + } + } + }; + + template struct axpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + real a[MAX_ORTHO_DIM]; + const F x; + real b[MAX_ORTHO_DIM]; + F y; + + axpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct axpby3d { + const Arg &arg; + constexpr axpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + real a = arg.a[idx[3]]; + real b = arg.b[idx[3]]; + + // Compute the axpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct caxpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + complex a[MAX_ORTHO_DIM]; + const F x; + complex b[MAX_ORTHO_DIM]; + F y; + + caxpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct caxpby3d { + const Arg &arg; + constexpr caxpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void 
operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + complex a = arg.a[idx[3]]; + complex b = arg.b[idx[3]]; + + // Compute the caxpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct reDotProduct3dArg : public ReduceArg { + using real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv Xh[4]; // checkerboard grid dimensions + + reDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ double init() const { return double(); } + }; + + template struct reDotProduct3d : plus { + using reduce_t = double; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + const Arg &arg; + constexpr reDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + reduce_t sum = innerProduct(x, y).real(); + + // Apply reduction to t bucket + return plus::operator()(sum, result); + } + }; + + template struct cDotProduct3dArg : public ReduceArg> { + using real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv X[4]; // grid dimensions + int_fastdiv Xh[4]; // checkerboard grid dimensions + + cDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg>(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? 
x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ reduce_t init() const { return {0.0, 0.0}; } + }; + + template struct cDotProduct3d : plus> { + using reduce_t = array; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + + const Arg &arg; + constexpr cDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + complex res = innerProduct(x, y); + + // Apply reduction to temporal bucket + return plus::operator()({res.real(), res.imag()}, result); + } + }; +} // namespace quda diff --git a/include/kernels/contraction.cuh b/include/kernels/contraction.cuh index 96079ed041..815da471d2 100644 --- a/include/kernels/contraction.cuh +++ b/include/kernels/contraction.cuh @@ -32,20 +32,6 @@ namespace quda return ((sink[3] * X[2] + sink[2]) * X[1] + sink[1]) * X[0] + sink[0]; } - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz /= X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct ContractionSummedArg : public ReduceArg { using reduce_t = contract_t; diff --git a/include/kernels/copy_field_offset.cuh b/include/kernels/copy_field_offset.cuh index ea4a603bfd..5f614b2e49 100644 --- a/include/kernels/copy_field_offset.cuh +++ b/include/kernels/copy_field_offset.cuh @@ -142,12 +142,12 @@ namespace quda if (arg.mode == QudaOffsetCopyMode::COLLECT) { // we are collecting so x_cb is the index for the input. idx_in = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.data()); idx_out = linkIndex(coordinate, arg.dim_out); } else { // we are dispersing so x_cb is the index for the output. 
idx_out = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.data()); idx_in = linkIndex(coordinate, arg.dim_in); } copy_field(s * arg.volume_4d_cb_out + idx_out, s * arg.volume_4d_cb_in + idx_in, parity, arg); diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index 799e511b44..62bf0d27d6 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -212,7 +212,7 @@ namespace quda template __device__ __host__ inline Vector d5(const Arg &arg, const Vector &in, int parity, int x_cb, int s, int src_idx) { - + int local_src_idx = target::thread_idx().y / arg.Ls; using real = typename Arg::real; constexpr bool is_variable = true; coeff_type coeff(arg); @@ -235,7 +235,7 @@ namespace quda const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; HalfVector half_in; if (shared) { - half_in = cache.load(threadIdx.x, fwd_s, parity); + half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity); } else { Vector full_in = arg.in[src_idx](fwd_idx, parity); half_in = full_in.project(4, proj_dir); @@ -258,7 +258,7 @@ namespace quda const int back_idx = back_s * arg.volume_4d_cb + x_cb; HalfVector half_in; if (shared) { - half_in = cache.load(threadIdx.x, back_s, parity); + half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity); } else { Vector full_in = arg.in[src_idx](back_idx, parity); half_in = full_in.project(4, proj_dir); @@ -283,7 +283,8 @@ namespace quda { // forwards direction const int fwd_s = (s + 1) % arg.Ls; const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; - const Vector in = shared ? cache.load(threadIdx.x, fwd_s, parity) : arg.in[src_idx](fwd_idx, parity); + const Vector in + = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity) : arg.in[src_idx](fwd_idx, parity); constexpr int proj_dir = dagger ? +1 : -1; if (s == arg.Ls - 1) { out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); @@ -295,7 +296,8 @@ namespace quda { // backwards direction const int back_s = (s + arg.Ls - 1) % arg.Ls; const int back_idx = back_s * arg.volume_4d_cb + x_cb; - const Vector in = shared ? cache.load(threadIdx.x, back_s, parity) : arg.in[src_idx](back_idx, parity); + const Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity) : + arg.in[src_idx](back_idx, parity); constexpr int proj_dir = dagger ? -1 : +1; if (s == 0) { out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); @@ -378,6 +380,7 @@ namespace quda __device__ __host__ inline Vector constantInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx) { + int local_src_idx = target::thread_idx().y / arg.Ls; using real = typename Arg::real; const auto k = arg.kappa; const auto inv = arg.inv; @@ -395,7 +398,8 @@ namespace quda for (int s = 0; s < arg.Ls; s++) { - Vector in = shared ? cache.load(threadIdx.x, s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); { int exp = s_ < s ? 
arg.Ls - s + s_ : s_ - s; @@ -436,6 +440,7 @@ namespace quda __device__ __host__ inline Vector variableInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx) { + int local_src_idx = target::thread_idx().y / arg.Ls; constexpr int nSpin = 4; using real = typename Arg::real; typedef ColorSpinor HalfVector; @@ -461,7 +466,7 @@ namespace quda auto factorR = (s_ < s ? -arg.m_f * R : R); if (shared) { - r += factorR * cache.load(threadIdx.x, s, parity); + r += factorR * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); } else { Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); r += factorR * in.project(4, proj_dir); @@ -489,7 +494,7 @@ namespace quda auto factorL = (s_ > s ? -arg.m_f * L : L); if (shared) { - l += factorL * cache.load(threadIdx.x, s, parity); + l += factorL * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); } else { Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); l += factorL * in.project(4, proj_dir); @@ -518,7 +523,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorR = (s_ < s ? -arg.m_f * R : R); - Vector in = shared ? cache.load(threadIdx.x, s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); r += factorR * in.project(4, proj_dir); R *= coeff.kappa(s); @@ -537,7 +543,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorL = (s_ > s ? -arg.m_f * L : L); - Vector in = shared ? cache.load(threadIdx.x, s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); l += factorL * in.project(4, proj_dir); L *= coeff.kappa(s); diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index 94e0465238..a6a5b3c1d5 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -11,8 +11,10 @@ namespace quda { int *getPackComms(); - template + constexpr int pack_tile_size = 1; + + template struct PackArg : kernel_param<> { typedef Float_ Float; @@ -24,6 +26,7 @@ namespace quda static constexpr bool dagger = dagger_; static constexpr int twist = twist_; // whether we are doing preconditioned twisted-mass or not (1 - singlet, 2 - doublet) static constexpr QudaPCType pc_type = pc_type_; // preconditioning type (4-d or 5-d) + static constexpr int n_src_tile = n_src_tile_; static constexpr bool spinor_direct_load = false; // false means texture load @@ -54,6 +57,7 @@ namespace quda int sites_per_block; + int_fastdiv n_src; int_fastdiv Ls; char *packBuffer[4 * QUDA_MAX_DIM]; @@ -96,6 +100,7 @@ namespace quda threadDimMapLower {}, threadDimMapUpper {}, sites_per_block((work_items + grid - 1) / grid), + n_src(in.size()), Ls(halo.X(4) / in.size()) #ifdef NVSHMEM_COMMS , @@ -131,7 +136,7 @@ namespace quda } }; - template + template __device__ __host__ inline void pack(const Arg &arg, int ghost_idx, int s, int parity, int src_idx) { typedef typename mapper::type real; @@ -160,46 +165,52 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? 
+1 : -1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } else { // forwards int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? -1 : +1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } } - template + template __device__ __host__ inline void packStaggered(const Arg &arg, int ghost_idx, int parity, int src_idx) { typedef typename mapper::type real; @@ -217,12 +228,18 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - Vector f = 
arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } @@ -250,24 +267,36 @@ namespace quda if (Arg::pc_type == QUDA_5D_PC) { // 5-d checkerboarded, include s (not ghostFaceCB since both faces) switch (dim) { case 0: - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, + parity, src_idx); break; case 1: - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, + parity, src_idx); break; case 2: - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, + parity, src_idx); break; case 3: - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, + parity, src_idx); break; } } else { // 4-d checkerboarding, keeping s separate (if it exists) switch (dim) { - case 0: pack(arg, ghost_idx, s, parity, src_idx); break; - case 1: pack(arg, ghost_idx, s, parity, src_idx); break; - case 2: pack(arg, ghost_idx, s, parity, src_idx); break; - case 3: pack(arg, ghost_idx, s, parity, src_idx); break; + case 0: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 1: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 2: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 3: + pack(arg, ghost_idx, s, parity, src_idx); + break; } } @@ -288,7 +317,8 @@ namespace quda // 64 - use uber kernel (merge exterior) template struct packShmem { - template __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -315,9 +345,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.dc.ghostFaceCB[0] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -325,9 +355,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.dc.ghostFaceCB[1] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * 
target::block_dim().x; } break; @@ -335,9 +365,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.dc.ghostFaceCB[2] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -345,9 +375,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.dc.ghostFaceCB[3] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -358,12 +388,19 @@ namespace quda #endif } - __device__ __forceinline__ void operator()(const Arg &arg, int s, int parity, int twist_pack) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s_block_, int parity, int twist_pack) { - switch (twist_pack) { - case 0: this->operator()<0>(arg, s, parity); break; - case 1: this->operator()<1>(arg, s, parity); break; - case 2: this->operator()<2>(arg, s, parity); break; + int src_s_block = MAX_MULTI_RHS == 1 ? 0 : src_s_block_; + int src_s_idx = src_s_block * Arg::n_src_tile; + if (src_s_idx + n_src_tile * arg.Ls <= arg.n_src * arg.Ls) { + switch (twist_pack) { + case 0: this->operator()<0, n_src_tile>(arg, src_s_idx, parity); break; + case 1: this->operator()<1, n_src_tile>(arg, src_s_idx, parity); break; + case 2: this->operator()<2, n_src_tile>(arg, src_s_idx, parity); break; + } + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_s_block, parity); } } }; @@ -377,7 +414,7 @@ namespace quda { if (arg.nParity == 1) parity = arg.parity; packShmem pack; - pack.operator()(arg, s, parity); + pack.operator()(arg, s, parity, Arg::twist); } }; @@ -400,17 +437,17 @@ namespace quda if (arg.nFace == 1) { switch (dim) { - case 0: packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } else if (arg.nFace == 3) { switch (dim) { - case 0: packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } @@ -422,7 +459,7 @@ namespace quda template struct 
packStaggeredShmem { - __device__ __forceinline__ void operator()(const Arg &arg, int src_idx, int parity, int = 0) + template __device__ __forceinline__ void apply(const Arg &arg, int src_idx, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -446,9 +483,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[0] + local_tid; if (arg.nFace == 1) - packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -456,9 +493,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[1] + local_tid; if (arg.nFace == 1) - packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -466,9 +503,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[2] + local_tid; if (arg.nFace == 1) - packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -476,9 +513,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[3] + local_tid; if (arg.nFace == 1) - packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -488,6 +525,18 @@ namespace quda if (arg.shmem) shmem_signal(dim, dir, arg); #endif } + + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_idx_block_, int parity, int = 0) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(arg, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_idx_block, parity); + } + } }; template struct pack_staggered_shmem { diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 7eafa4a687..ec09ab1622 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -14,9 +14,9 @@ namespace quda /** @brief Parameter structure for driving the Staggered Dslash operator */ - template - struct StaggeredArg : DslashArg { + template + struct StaggeredArg : DslashArg { typedef typename mapper::type real; static constexpr int nColor = nColor_; static constexpr int nSpin = 1; @@ -37,7 +37,6 @@ namespace quda using GL = typename gauge_mapper::type; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field */ F in[MAX_MULTI_RHS]; /** input vector field */ const Ghost halo_pack; /** accessor for writing the halo */ @@ -57,8 +56,8 @@ namespace quda StaggeredArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a, cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, - spin_project, comm_override), + DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, + improved_ ? 3 : 1, spin_project, comm_override), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), @@ -87,9 +86,9 @@ namespace quda @param[in] parity The site parity @param[in] x_cb The checkerboarded site index */ - template - __device__ __host__ inline void applyStaggered(Vector &out, const Arg &arg, Coord &coord, int parity, int, - int thread_dim, bool &active, int src_idx) + template + __device__ __host__ inline void applyStaggered(array &out, const Arg &arg, Coord &coord, + int parity, int, int thread_dim, bool &active, int src_idx) { typedef typename mapper::type real; typedef Matrix, Arg::nColor> Link; @@ -104,13 +103,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? 
arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.in[src_idx](fwd_idx, their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } } @@ -120,14 +126,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in - = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in = arg.in[src_idx](fwd3_idx, their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } } @@ -140,15 +152,22 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back_idx = linkIndexM1(coord, arg.dim, d); const int gauge_idx = back_idx; const Link U = arg.improved ? 
arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.in[src_idx](back_idx, their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } } @@ -159,15 +178,21 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); - const Vector in - = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); - const Vector in = arg.in[src_idx](back3_idx, their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } } } // nDim @@ -181,8 +206,8 @@ namespace quda constexpr staggered(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + template + __device__ __host__ __forceinline__ void apply(int idx, int src_idx, int parity) { using real = typename mapper::type; using Vector = ColorSpinor; @@ -195,20 +220,41 @@ namespace quda const int my_spinor_parity = nParity == 2 ? parity : 0; - Vector out; + array out; + applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - - out *= arg.dagger_scale; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity); - out = arg.a * x - out; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x - out[s]; + } } else if (mykernel_type != INTERIOR_KERNEL) { - Vector x = arg.out[src_idx](coord.x_cb, my_spinor_parity); - out = x + (xpay ? -out : out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = x + (xpay ? -out[s] : out[s]); + } + } + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } + } + } + + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block_, int parity) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(idx, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(idx, src_idx_block, parity); } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out; } }; diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index 514c4df957..584999d52f 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -34,7 +34,6 @@ namespace quda typedef typename mapper::type real; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field set */ F in[MAX_MULTI_RHS]; /** input vector field set */ F x[MAX_MULTI_RHS]; /** input vector set when doing xpay */ diff --git a/include/kernels/evec_project.cuh b/include/kernels/evec_project.cuh index 7bea55d908..add4b7caba 100644 --- a/include/kernels/evec_project.cuh +++ b/include/kernels/evec_project.cuh @@ -47,25 +47,11 @@ namespace quda { __device__ __host__ spinor_array init() const { return spinor_array(); } }; - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz = xyz / X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct EvecProjection : plus { using reduce_t = spinor_array; - using plus::operator(); - static constexpr int reduce_block_dim = 1; // only doing a reduct in the x thread dimension - + using plus::operator(); + static constexpr int reduce_block_dim = 1; // only doing a reduce in the x thread dimension + const Arg &arg; constexpr EvecProjection(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index d0846de352..a3c2b1b377 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -44,8 +44,8 @@ namespace quda LaplaceArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, int dir, double a, double b, - cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override), + cvector_ref &x, int parity, const int *comm_override) : + DslashArg(out, in, halo, U, x, parity, false, a != 0.0 ? true : false, 1, false, comm_override), halo_pack(halo), halo(halo), U(U), @@ -74,7 +74,7 @@ namespace quda @param[in] thread_dim Which dimension this thread corresponds to (fused exterior only) */ - template + template __device__ __host__ inline void applyLaplace(Vector &out, Arg &arg, Coord &coord, int parity, int, int thread_dim, bool &active, int src_idx) { @@ -136,15 +136,16 @@ namespace quda } // out(x) = M*in - template struct laplace : dslash_default { + template struct laplace : dslash_default { const Arg &arg; constexpr laplace(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_, int parity) { + int src_idx = MAX_MULTI_RHS == 1 ? 
0 : src_idx_; using real = typename mapper::type; using Vector = ColorSpinor; @@ -163,12 +164,10 @@ namespace quda // case 4 is an operator in all x,y,z,t dimensions // case 3 is a spatial operator only, the t dimension is omitted. switch (arg.dir) { - case 3: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); - break; + case 3: applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; case 4: default: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); + applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; } diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 22323e66a2..85e1790318 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -179,9 +179,9 @@ namespace quda if (arg.is_t0_kernel) { if (arg.t0 < 0) return; - if (mykernel_type == INTERIOR_KERNEL) { + if constexpr (mykernel_type == INTERIOR_KERNEL) { idx += arg.t0 * arg.t0_offset; - } else if (mykernel_type != EXTERIOR_KERNEL_ALL) { + } else if constexpr (mykernel_type != EXTERIOR_KERNEL_ALL) { if (idx >= arg.t0_face_size[mykernel_type]) idx += arg.face_size[mykernel_type] - arg.t0_face_size[mykernel_type]; idx += arg.t0 * arg.t0_face_offset[mykernel_type]; diff --git a/include/lattice_field.h b/include/lattice_field.h index 462add3bde..2cf1f6beaa 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -902,6 +902,40 @@ namespace quda { #define checkPrecision(...) Precision_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** + @brief Helper function for determining if the color of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @return If color is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a_, const T2 &b_) + { + const unwrap_t &a(a_); + const unwrap_t &b(b_); + int nColor = 0; + if (a.Ncolor() == b.Ncolor()) + nColor = a.Ncolor(); + else + errorQuda("Color %d %d do not match (%s:%d in %s())", a.Ncolor(), b.Ncolor(), file, line, func); + return nColor; + } + + /** + @brief Helper function for determining if the color of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @param[in] args List of additional fields to check color on + @return If colors is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a, const T2 &b, const Args &...args) + { + return Color_(func, file, line, a, b) & Color_(func, file, line, a, args...); + } + +#define checkColor(...) Color_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** @brief Helper function for determining if the field is in native order @param[in] a Input field diff --git a/include/quda.h b/include/quda.h index 51d0b98043..1ef14401f3 100644 --- a/include/quda.h +++ b/include/quda.h @@ -9,6 +9,7 @@ * as the Fortran interface in lib/quda_fortran.F90. */ +#include // bool support #include #include /* for FILE */ #include @@ -283,8 +284,6 @@ extern "C" { double temp; /**< The mean temperature of the device for the duration of the solve */ double clock; /**< The mean clock frequency of the device for the duration of the solve */ - QudaTune tune; /**< Enable auto-tuning? 
(default = QUDA_TUNE_YES) */ - /** Number of steps in s-step algorithms */ int Nsteps; @@ -496,7 +495,7 @@ extern "C" { QudaBoolean preserve_deflation; /** This is where we store the deflation space. This will point - to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ + to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ void *preserve_deflation_space; /** If we restore the deflation space, this boolean indicates + false, but preserve_deflation would be true */ QudaBoolean preserve_evals; + /** Whether to use the smeared gauge field for the Dirac operator + whose eigenvalues we are computing. */ + bool use_smeared_gauge; + /** What type of Dirac operator we are using **/ /** If !(use_norm_op) && !(use_dagger) use M. **/ /** If use_dagger, use Mdag **/ @@ -569,6 +572,12 @@ extern "C" { /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ char QUDA_logfile[512]; + /** The orthogonal direction in the 3D eigensolver **/ + int ortho_dim; + + /** The local size of the orthogonal direction in the 3D eigensolver **/ + int ortho_dim_size_local; + //------------------------------------------------- // EIG-CG PARAMS @@ -1488,7 +1497,7 @@ extern "C" { * @param inGauge Pointer to the device gauge field (QUDA device field) * @param param The parameters of the host and device fields */ - void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); + void saveGaugeFieldQuda(void *outGauge, void *inGauge, QudaGaugeParam *param); /** * Reinterpret gauge as a pointer to a GaugeField and call destructor. diff --git a/include/quda_define.h.in b/include/quda_define.h.in index c17b361b56..a9622c8ea1 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -25,6 +25,12 @@ */ #define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@ +/** + * @def MAX_MULTI_RHS_TILE + * @brief This macro sets the register tile size for MRHS kernels + */ +#define MAX_MULTI_RHS_TILE @QUDA_MAX_MULTI_RHS_TILE@ + #cmakedefine QUDA_HETEROGENEOUS_ATOMIC #ifdef QUDA_HETEROGENEOUS_ATOMIC /** @@ -36,6 +42,16 @@ #undef QUDA_HETEROGENEOUS_ATOMIC #endif +#cmakedefine QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +/** + * @def HETEROGENEOUS_ATOMIC_INF_INIT + * @brief This macro sets whether we are using infinity for the signaling sentinel + */ +#define HETEROGENEOUS_ATOMIC_INF_INIT +#undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#endif + #cmakedefine QUDA_LARGE_KERNEL_ARG #cmakedefine QUDA_DIRAC_CLOVER_HASENBUSCH diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index da3e178bcf..aceb88c83d 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -11,12 +11,17 @@ #include #include #include -#include +#include using count_t = cuda::atomic; namespace quda { +// By default we use negative infinity as the sentinel when testing for +// reduction completion. On some compilers we may need to use finite +// numbers, so the alternative approach uses negative zero instead; the +// choice is controlled by the CMake option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT.
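The negative-zero fallback works because IEEE 754 equality treats -0.0 and +0.0 as equal: a plain comparison against the sentinel cannot tell an untouched slot from a reduction that genuinely computed zero, so the sign bit must be inspected as well. The following is a minimal host-side sketch of that test (an illustration only, not part of the patch; it mirrors the is_complete helper added below):

#include <cassert>
#include <cmath> // std::signbit

// A slot still holding the initialization sentinel -0.0 is incomplete; any
// other value, including a genuinely computed +0.0, counts as complete.
template <typename T> bool is_complete(const T &result)
{
  return !(result == static_cast<T>(0.0) && std::signbit(result));
}

int main()
{
  double slot = -0.0;         // initialization sentinel
  assert(-0.0 == 0.0);        // equality alone cannot detect the sentinel
  assert(!is_complete(slot)); // the sign bit identifies the untouched slot
  slot = 0.0;                 // a result that happens to be exactly zero
  assert(is_complete(slot));  // correctly treated as complete
}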
+#ifdef HETEROGENEOUS_ATOMIC_INF_INIT /** @brief The initialization value we used to check for completion */ @@ -28,6 +33,31 @@ namespace quda */ template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } + /** + @brief Test if the result is complete (e.g., is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return result != init_value(); } +#else + /** + @brief The initialization value we used to check for completion + */ + template constexpr T init_value() { return -static_cast(0.0); } + + /** + @brief The termination value we use to prevent a possible hang in + case the computed reduction is equal to the initialization + */ + template constexpr T terminate_value() { return static_cast(0.0); } + + /** + @brief Test if the result is complete (e.g., is not equal to the sentinel) + */ + template bool is_complete(const T &result) + { + return !(result == static_cast(0.0) && std::signbit(result)); + } +#endif + // declaration of reduce function template __device__ inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx = 0); @@ -129,7 +159,7 @@ namespace quda if (consumed) errorQuda("Cannot call complete more than once for each construction"); for (int i = 0; i < n_reduce * n_item; i++) { - while (result_h[i].load(cuda::std::memory_order_relaxed) == init_value()) { } + while (!is_complete(result_h[i].load(cuda::std::memory_order_relaxed))) { } } // copy back result element by element and convert if necessary to host reduce type @@ -169,11 +199,7 @@ namespace quda auto tid = target::thread_idx_linear<2>(); if (arg.result_d) { // write to host mapped memory -#ifdef _NVHPC_CUDA // WAR for nvc++ - constexpr bool coalesced_write = false; -#else constexpr bool coalesced_write = true; -#endif if constexpr (coalesced_write) { static_assert(n <= device::warp_size(), "reduction array is greater than warp size"); auto mask = __ballot_sync(0xffffffff, tid < n); diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h deleted file mode 100644 index 1c4d7f3244..0000000000 --- a/include/targets/cuda/thread_array.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifndef _NVHPC_CUDA - -#include "../generic/thread_array.h" - -#else - -#include - -namespace quda -{ - template struct thread_array : array { - static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } - }; -} // namespace quda - -#endif diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h deleted file mode 100644 index 24f986cc26..0000000000 --- a/include/targets/hip/thread_array.h +++ /dev/null @@ -1 +0,0 @@ -#include "../generic/thread_array.h" diff --git a/include/tune_quda.h b/include/tune_quda.h index 03b9b148ee..b49ccc9753 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -43,6 +43,14 @@ namespace quda { */ const std::map &getTuneCache(); + /** + @brief Unify all instances of the tunecache across ranks. This + is called after returning to a global communicator. 
+ @param[in] rank_list The list of ranks from whose tunecaches we + want to merge to form the global tunecache + */ + void joinTuneCache(const std::vector &rank_list); + /** @brief Return a string encoding the QUDA version */ @@ -61,7 +69,7 @@ namespace quda { class Tunable { - friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity); + friend TuneParam tuneLaunch(Tunable &, bool, QudaVerbosity); static inline uint64_t _flops_global = 0; static inline uint64_t _bytes_global = 0; @@ -417,7 +425,7 @@ namespace quda { * @param[in] verbosity What verbosity to use during tuning? * @return The tuned launch parameters */ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); + TuneParam tuneLaunch(Tunable &tunable, bool enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); /** * @brief Post an event in the trace, recording where it was posted diff --git a/include/util_quda.h b/include/util_quda.h index 3d68fb5a2e..031f878762 100644 --- a/include/util_quda.h +++ b/include/util_quda.h @@ -20,7 +20,25 @@ namespace quda @brief Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_ENABLE_TUNING=0. @return If autotuning is enabled */ -QudaTune getTuning(); +bool getTuning(); + +/** + @brief Set the tuning state + @param[in] tuning New tuning state + */ +void setTuning(bool tuning); + +/** + @brief Push a new tuning state, and back up the present one on the + stack. + @param[in] tune New tuning state + */ +void pushTuning(bool tune); + +/** + @brief Pop the present tuning state and restore what is on the stack. + */ +void popTuning(); QudaVerbosity getVerbosity(); char *getOutputPrefix(); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 017341adc5..d88ba6a239 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -20,8 +20,9 @@ set (QUDA_OBJS solve.cpp monitor.cpp dirac_coarse.cpp dslash_coarse.cpp coarse_op.cpp coarsecoarse_op.cpp coarse_op_preconditioned.cpp staggered_coarse_op.cpp - eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp vector_io.cpp - eigensolve_quda.cpp quda_arpack_interface.cpp + eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp + eig_trlm_3d.cpp blas_3d.cu + vector_io.cpp eigensolve_quda.cpp quda_arpack_interface.cpp multigrid.cpp transfer.cpp block_orthogonalize.cpp prolongator.cpp restrictor.cpp staggered_prolong_restrict.cu gauge_phase.cu timer.cpp diff --git a/lib/blas_3d.cu b/lib/blas_3d.cu new file mode 100644 index 0000000000..acab3da85e --- /dev/null +++ b/lib/blas_3d.cu @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + namespace blas3d + { + + template class copy3D : TunableKernel2D + { + ColorSpinorField &y; + ColorSpinorField &x; + const int t_slice; + const copyType type; + unsigned int minThreads() const { return y.VolumeCB(); } + + public: + copy3D(ColorSpinorField &y, ColorSpinorField &x, int t_slice, copyType type) : + TunableKernel2D(y, y.SiteSubset()), y(y), x(x), t_slice(t_slice), type(type) + { + // Check slice value + if (t_slice < 0 || t_slice >= y.X()[3]) errorQuda("Unexpected slice %d", t_slice); + + strcat(aux, type == copyType::SWAP_3D ? ",swap_3d" : type == copyType::COPY_TO_3D ? 
",to_3d" : ",from_3d"); + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + copy3dArg arg(y, x, t_slice); + switch (type) { + case copyType::COPY_TO_3D: launch(tp, stream, arg); break; + case copyType::COPY_FROM_3D: launch(tp, stream, arg); break; + case copyType::SWAP_3D: launch(tp, stream, arg); break; + default: errorQuda("Unknown 3D copy type"); + } + } + + void preTune() + { + x.backup(); + y.backup(); + } + void postTune() + { + x.restore(); + y.restore(); + } + long long bytes() const + { + return (type == copyType::SWAP_3D ? 2 : 1) * (x.Bytes() / x.X(3) + y.Bytes() / y.X(3)); + } + }; + + void copy(const int slice, const copyType type, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + // Check orth dim + if (x.X()[3] != 1) errorQuda("Unexpected dimensions in x[3]=%d", x.X()[3]); + // We must give a 4D Lattice field as the first argument + instantiate(y, x, slice, type); + } + + void swap(int slice, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + instantiate(x, y, slice, copyType::SWAP_3D); + } + + template class axpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + axpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, axpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 6 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + }; + + void axpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + void ax(const std::vector &a, ColorSpinorField &x) + { + std::vector zeros(a.size(), 0.0); + axpby(a, x, zeros, x); + } + + template class caxpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + caxpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, + const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, caxpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 14 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + 
}; + + void caxpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + template class reDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + reDotProduct3D(const ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + reDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 2; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void reDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + template class cDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + cDotProduct3D(const ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + cDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 8; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void cDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + } // namespace blas3d + +} // namespace quda diff --git a/lib/check_params.h b/lib/check_params.h index 068c9a4d3e..07a639df09 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -175,6 +175,7 @@ void printQudaEigParam(QudaEigParam *param) { P(preserve_deflation, QUDA_BOOLEAN_FALSE); P(preserve_deflation_space, 0); P(preserve_evals, QUDA_BOOLEAN_TRUE); + P(use_smeared_gauge, false); P(use_dagger, QUDA_BOOLEAN_FALSE); P(use_norm_op, QUDA_BOOLEAN_FALSE); P(compute_svd, QUDA_BOOLEAN_FALSE); @@ -578,6 +579,7 @@ void printQudaInvertParam(QudaInvertParam *param) { // domain decomposition parameters //P(inv_type_sloppy, QUDA_INVALID_INVERTER); // disable since invalid means no preconditioner #if defined INIT_PARAM + P(dslash_type_precondition, QUDA_INVALID_DSLASH); P(inv_type_precondition, QUDA_INVALID_INVERTER); P(preconditioner, 0); P(tol_precondition, INVALID_DOUBLE); diff --git 
a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index ef370eda18..71664614cf 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -329,7 +329,7 @@ namespace quda { printf("rank = %d x = %u, s = %d, { ", comm_rank(), x_cb, s); for (int c = 0; c < o.Ncolor(); c++) { auto value = complex(o(parity, x_cb, s, c)); - printf("(%f,%f) ", value.real(), value.imag()); + printf("(%g,%g) ", value.real(), value.imag()); } printf("}\n"); } @@ -378,8 +378,6 @@ namespace quda { void genericPrintVector(const ColorSpinorField &a, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - ColorSpinorParam param(a); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_COPY_FIELD_CREATE; @@ -388,6 +386,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const ColorSpinorField &a_ = !host_clone ? a : *clone_a.get(); + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; diff --git a/lib/communicator_mpi.cpp b/lib/communicator_mpi.cpp index 16d6f46c17..8bc24808ea 100644 --- a/lib/communicator_mpi.cpp +++ b/lib/communicator_mpi.cpp @@ -52,6 +52,7 @@ namespace quda } comm_init(nDim, commDims, rank_from_coords, map_data); + globalReduce.push(true); } @@ -120,6 +121,9 @@ namespace quda grid_size, size); } + // defer handling MPI errors to QUDA + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_qmp.cpp b/lib/communicator_qmp.cpp index 1a4a2a7878..67d4a8694f 100644 --- a/lib/communicator_qmp.cpp +++ b/lib/communicator_qmp.cpp @@ -166,6 +166,9 @@ void Communicator::comm_init(int ndim, const int *dims, QudaCommsMap rank_from_c grid_size, QMP_comm_get_number_of_nodes(QMP_COMM_HANDLE)); } + // defer handling MPI errors to QMP + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_stack.cpp b/lib/communicator_stack.cpp index ef5db59a0a..adfd332f2b 100644 --- a/lib/communicator_stack.cpp +++ b/lib/communicator_stack.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace quda { @@ -49,8 +50,16 @@ namespace quda void push_communicator(const CommKey &split_key) { if (comm_nvshmem_enabled()) - errorQuda( - "Split-grid is currently not supported with NVSHMEM. Please set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + errorQuda("Split-grid is currently not supported with NVSHMEM. Set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + + // if reverting to global, we will need to join the tunecaches + bool join_tune_cache = split_key == default_comm_key; + int local_rank = comm_rank(); + int local_tune_rank = 0; + + // used to store the size of the tunecache at the point of splitting + static size_t tune_cache_size = 0; + auto search = communicator_stack.find(split_key); if (search == communicator_stack.end()) { communicator_stack.emplace(std::piecewise_construct, std::forward_as_tuple(split_key), @@ -59,7 +68,38 @@ namespace quda LatticeField::freeGhostBuffer(); // Destroy the (IPC) Comm buffers with the old communicator. 
+ auto split_key_old = current_key; current_key = split_key; + + // we are returning to global so we need to join any diverged tunecaches + if (join_tune_cache) { + // has this tunecache been updated? + int tune_cache_update = tune_cache_size != getTuneCache().size(); + // have any of tunecaches across the split grid been updated? + comm_allreduce_int(tune_cache_update); + + if (tune_cache_update) { + auto num_sub_partition = split_key_old.product(); // the number of caches we need to merge + int sub_partition_dims[] = {comm_dim(0) / split_key_old[0], comm_dim(1) / split_key_old[1], + comm_dim(2) / split_key_old[2], comm_dim(3) / split_key_old[3]}; + int sub_partition_coords[] = {comm_coord(0) / sub_partition_dims[0], comm_coord(1) / sub_partition_dims[1], + comm_coord(2) / sub_partition_dims[2], comm_coord(3) / sub_partition_dims[3]}; + auto sub_partition_idx = sub_partition_coords[split_key_old.n_dim - 1]; + for (auto d = split_key_old.n_dim - 2; d >= 0; d--) + sub_partition_idx = sub_partition_idx * split_key_old[d] + sub_partition_coords[d]; + + std::vector global_tune_ranks(num_sub_partition); + for (auto i = 0; i < num_sub_partition; i++) { + global_tune_ranks[i] = (sub_partition_idx == i && local_tune_rank == local_rank) ? comm_rank() : 0; + comm_allreduce_int(global_tune_ranks[i]); + } + // we now have a list of all the global tune ranks so we can join them + joinTuneCache(global_tune_ranks); + } + } else if (!join_tune_cache) { + // record size of tunecache when first splitting the grid + tune_cache_size = getTuneCache().size(); + } } #if defined(QMP_COMMS) || defined(MPI_COMMS) @@ -70,8 +110,12 @@ namespace quda int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); } + int comm_dim_global(int dim) { return get_default_communicator().comm_dim(dim); } + int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); } + int comm_coord_global(int dim) { return get_default_communicator().comm_coord(dim); } + int comm_rank_from_coords(const int *coords) { return get_current_communicator().comm_rank_from_coords(coords); } void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, bool user_set_comm_handle, diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp index 8ce5acf133..b388d4c001 100644 --- a/lib/dslash_coarse.in.cpp +++ b/lib/dslash_coarse.in.cpp @@ -94,9 +94,9 @@ namespace quda IntList<@QUDA_MULTIGRID_NVEC_LIST@>()); } else { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder); - ColorSpinorField v_inB = create_color_spinor_copy(inB, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); + auto v_inA = create_color_spinor_copy(inA, csOrder); + auto v_inB = create_color_spinor_copy(inB, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); if (dslash) { BlockTransposeForward(v_inA, inA); } if (clover) { BlockTransposeForward(v_inB, inB); } diff --git a/lib/dslash_pack2.cu b/lib/dslash_pack2.cu index 043ea65abc..01b70f3284 100644 --- a/lib/dslash_pack2.cu +++ b/lib/dslash_pack2.cu @@ -139,6 +139,10 @@ namespace quda strcpy(aux, "policy_kernel,"); strcat(aux, in.AuxString().c_str()); setRHSstring(aux, in.size()); + strcat(aux, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, pack_tile_size); + strcat(aux, tile_str); char comm[5]; for (int i = 0; i < 4; i++) comm[i] = (comm_dim_pack[i] ? 
'1' : '0'); comm[4] = '\0'; diff --git a/lib/eig_trlm_3d.cpp b/lib/eig_trlm_3d.cpp new file mode 100644 index 0000000000..7480a58395 --- /dev/null +++ b/lib/eig_trlm_3d.cpp @@ -0,0 +1,670 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + // Thick Restarted Lanczos Method constructor + TRLM3D::TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param) : EigenSolver(mat, eig_param) + { + getProfile().TPSTART(QUDA_PROFILE_INIT); + + ortho_dim = eig_param->ortho_dim; + ortho_dim_size = eig_param->ortho_dim_size_local; + if (ortho_dim != 3) + errorQuda("Only 3D spatial splitting (ortho_dim = 3) is supported, ortho_dim passed = %d", ortho_dim); + verbose_rank = comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0; + + // Tridiagonal/Arrow matrices + alpha_3D.resize(ortho_dim_size); + beta_3D.resize(ortho_dim_size); + + residua_3D.resize(ortho_dim_size); + ritz_mat_3D.resize(ortho_dim_size); + converged_3D.resize(ortho_dim_size, false); + active_3D.resize(ortho_dim_size, false); + + iter_locked_3D.resize(ortho_dim_size, 0); + iter_keep_3D.resize(ortho_dim_size, 0); + iter_converged_3D.resize(ortho_dim_size, 0); + + num_locked_3D.resize(ortho_dim_size, 0); + num_keep_3D.resize(ortho_dim_size, 0); + num_converged_3D.resize(ortho_dim_size, 0); + + for (int i = 0; i < ortho_dim_size; i++) { + alpha_3D[i].resize(n_kr, 0.0); + beta_3D[i].resize(n_kr, 0.0); + residua_3D[i].resize(n_kr, 0.0); + } + + // 3D thick restart specific checks + if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be at least n_ev+6=%d\n", n_kr, n_ev + 6); + + if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) { + errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver"); + } + + getProfile().TPSTOP(QUDA_PROFILE_INIT); + } + + void TRLM3D::operator()(std::vector &kSpace, std::vector &evals) + { + // Create 3-d split communicators + CommKey split_key = {1, 1, 1, comm_dim(3)}; + + if (!split_key.is_valid()) { + errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]); + } + logQuda(QUDA_DEBUG_VERBOSE, "Splitting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n", + comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]); + push_communicator(split_key); + + // Override any user input for block size. + block_size = 1; + + // Pre-launch checks and preparation + //--------------------------------------------------------------------------- + queryPrec(kSpace[0].Precision()); + // Check to see if we are loading eigenvectors + if (strcmp(eig_param->vec_infile, "") != 0) { + printfQuda("Loading evecs from file name %s\n", eig_param->vec_infile); + loadFromFile(kSpace, evals); + return; + } + + // Check for an initial guess. If none present, populate with rands, then + // orthonormalise + prepareInitialGuess3D(kSpace, ortho_dim_size); + + // Increase the size of kSpace passed to the function; it will be trimmed back + // to its original size before exit.
+ prepareKrylovSpace(kSpace, evals); + + // Check for Chebyshev maximum estimation + checkChebyOpMax(kSpace); + + // Convergence and locking criteria + std::vector mat_norm_3D(ortho_dim_size, 0.0); + double epsilon = setEpsilon(kSpace[0].Precision()); + + // Print Eigensolver params + printEigensolverSetup(); + //--------------------------------------------------------------------------- + + // Begin TRLM Eigensolver computation + //--------------------------------------------------------------------------- + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + int t_offset = ortho_dim_size * comm_coord(3); + + // Loop over restart iterations. + while (restart_iter < max_restarts && !converged) { + + // Get min step + int step_min = *std::min_element(num_locked_3D.begin(), num_locked_3D.end()); + comm_allreduce_min(step_min); + + for (int step = step_min; step < n_kr; step++) lanczosStep3D(kSpace, step); + iter += (n_kr - step_min); + + // The eigenvalues are returned in the alpha array + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + eigensolveFromArrowMat3D(); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + // mat_norm is updated and used for LR + for (int t = 0; t < ortho_dim_size; t++) + for (int i = num_locked_3D[t]; i < n_kr; i++) + if (fabs(alpha_3D[t][i]) > mat_norm_3D[t]) mat_norm_3D[t] = fabs(alpha_3D[t][i]); + + // Lambda that returns mat_norm for LR and returns the relevant alpha + // (the corresponding Ritz value) for SR + auto check_norm = [&](double sr_norm, int t) -> double { + if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) + return mat_norm_3D[t]; + else + return sr_norm; + }; + + // Locking check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_locked_3D[t] = 0; + for (int i = 0; i < (n_kr - num_locked_3D[t]); i++) { + if (residua_3D[t][i + num_locked_3D[t]] < epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_locked_3D[t] = i + 1; + } else { + // Unlikely to find new locked pairs + break; + } + } + } + } + + // Convergence check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_converged_3D[t] = iter_locked_3D[t]; + for (int i = iter_locked_3D[t]; i < n_kr - num_locked_3D[t]; i++) { + if (residua_3D[t][i + num_locked_3D[t]] < tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_converged_3D[t] = i + 1; + } else { + // Unlikely to find new converged pairs + break; + } + } + } + } + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) + iter_keep_3D[t] + = std::min(iter_converged_3D[t] + (n_kr - num_converged_3D[t]) / 2, n_kr - num_locked_3D[t] - 12); + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + computeKeptRitz3D(kSpace); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + int min_nconv = n_kr + 1; + int min_nlock = n_kr + 1; + int min_nkeep = n_kr + 1; + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + num_converged_3D[t] = num_locked_3D[t] + iter_converged_3D[t]; + num_keep_3D[t] = num_locked_3D[t] + iter_keep_3D[t]; + num_locked_3D[t] += iter_locked_3D[t]; + + if (num_converged_3D[t] < min_nconv) min_nconv = num_converged_3D[t]; + if (num_locked_3D[t] < min_nlock) 
min_nlock = num_locked_3D[t]; + if (num_keep_3D[t] < min_nkeep) min_nkeep = num_keep_3D[t]; + + // Use printf to get data from t dim only + if (getVerbosity() >= QUDA_DEBUG_VERBOSE && verbose_rank) { + printf("%04d converged eigenvalues for timeslice %d at restart iter %04d\n", num_converged_3D[t], + t_offset + t, restart_iter + 1); + printf("iter Conv[%d] = %d\n", t_offset + t, iter_converged_3D[t]); + printf("iter Keep[%d] = %d\n", t_offset + t, iter_keep_3D[t]); + printf("iter Lock[%d] = %d\n", t_offset + t, iter_locked_3D[t]); + printf("num_converged[%d] = %d\n", t_offset + t, num_converged_3D[t]); + printf("num_keep[%d] = %d\n", t_offset + t, num_keep_3D[t]); + printf("num_locked[%d] = %d\n", t_offset + t, num_locked_3D[t]); + for (int i = 0; i < n_kr; i++) { + printf("Ritz[%d][%d] = %.16e residual[%d] = %.16e\n", t_offset + t, i, alpha_3D[t][i], i, residua_3D[t][i]); + } + } + } + } + + // Check for convergence + bool all_converged = true; + for (int t = 0; t < ortho_dim_size; t++) { + if (num_converged_3D[t] >= n_conv) { + converged_3D[t] = true; + } else { + all_converged = false; + } + } + + if (getVerbosity() >= QUDA_VERBOSE && verbose_rank) { + std::stringstream converged; + std::copy(converged_3D.begin(), converged_3D.end(), std::ostream_iterator(converged, "")); + printf("iter = %d rank = %d converged = %s min nlock %3d nconv %3d nkeep %3d\n", restart_iter + 1, + comm_rank_global(), converged.str().c_str(), min_nlock, min_nconv, min_nkeep); + } + + if (all_converged) { + reorder3D(kSpace); + converged = true; + } + + restart_iter++; + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + + // Post computation report + //--------------------------------------------------------------------------- + if (!converged) { + if (eig_param->require_convergence) { + errorQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Exiting.", + n_conv, n_ev, n_kr, max_restarts); + } else { + warningQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. 
Continuing with current Lanczos factorisation.", + n_conv, n_ev, n_kr, max_restarts); + // Compute eigenvalues + computeEvals(kSpace, evals); + } + } else { + if (verbose_rank) { + if (getVerbosity() >= QUDA_SUMMARIZE) { + printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n", + comm_rank_global(), n_conv, restart_iter, iter); + } + + if (getVerbosity() >= QUDA_VERBOSE) { + // Dump all Ritz values and residua if using Chebyshev + if (eig_param->use_poly_acc) { + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < n_conv; i++) { + printf("RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0, + residua_3D[t][i]); + } + } + } + } + } + + // Compute eigenvalues + computeEvals(kSpace, evals); + } + + push_communicator(default_comm_key); + + // ensure all processes have all e-values + comm_allreduce_sum(evals); + + // Local clean-up + cleanUpEigensolver(kSpace, evals); + } + + // Thick Restart 3D Member functions + //--------------------------------------------------------------------------- + void TRLM3D::lanczosStep3D(std::vector &v, int j) + { + // Compute r[t] = A[t] * v_j[t] - b_{j-1}[t] * v_{j-1}[t] + // r[t] = A[t] * v_j[t] + + // Use this while we have only axpby in place of axpy; + std::vector unit(ortho_dim_size, 1.0); + std::vector alpha_j(ortho_dim_size, 0.0); + std::vector beta_j(ortho_dim_size, 0.0); + + // 3D vectors that hold data for individual t components + ColorSpinorParam csParamClone(v[0]); + csParamClone.change_dim(ortho_dim, 1); + std::vector vecs_t(1, csParamClone); + ColorSpinorField r_t(csParamClone); + + // Identify the active 3D slices. This is the only place in the + // entire algorithm where the active_3D array should be + // modified. + for (int t = 0; t < ortho_dim_size; t++) { + // Every element of the active array must be assessed + active_3D[t] = (num_keep_3D[t] <= j && !converged_3D[t]) ? true : false; + } + + // This will be a blocked operator with no + // connections in the ortho_dim (usually t) + // hence the 3D sections of each vector + // will be independent. + chebyOp(r[0], v[j]); + + // a_j[t] = v_j^dag[t] * r[t] + blas3d::reDotProduct(alpha_j, v[j], r[0]); + for (int t = 0; t < ortho_dim_size; t++) { + // Only active problem data is recorded + if (active_3D[t]) alpha_3D[t][j] = alpha_j[t]; + } + + // r[t] = r[t] - a_j[t] * v_j[t] + for (int t = 0; t < ortho_dim_size; t++) alpha_j[t] = active_3D[t] ? -alpha_j[t] : 0.0; + blas3d::axpby(alpha_j, v[j], unit, r[0]); + + // r[t] = r[t] - b_{j-1}[t] * v_{j-1}[t] + // Only orthogonalise active problems + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + int start = (j > num_keep_3D[t]) ?
j - 1 : 0; + if (j - start > 0) { + + // Ensure we have enough 3D vectors + if ((int)vecs_t.size() < j - start) resize(vecs_t, j - start, csParamClone); + + // Copy the 3D data into the 3D vectors, create beta array, and create + // pointers to the 3D vectors + std::vector beta_t; + beta_t.reserve(j - start); + // Copy residual + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, r_t, r[0]); + // Copy vectors + for (int i = start; i < j; i++) { + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i - start], v[i]); + beta_t.push_back(-beta_3D[t][i]); + } + + // r[t] = r[t] - beta[t]{j-1} * v[t]{j-1} + blas::block::axpy(beta_t, {vecs_t.begin(), vecs_t.begin() + j - start}, r_t); + + // Copy residual back to 4D vector + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, r_t, r[0]); + } + } + } + + // Orthogonalise r against the Krylov space + for (int k = 0; k < 1; k++) blockOrthogonalize3D(v, r, j + 1); // future work: up to 4 times??? + + // b_j[t] = ||r[t]|| + blas3d::reDotProduct(beta_j, r[0], r[0]); + for (int t = 0; t < ortho_dim_size; t++) beta_j[t] = active_3D[t] ? sqrt(beta_j[t]) : 0.0; + + // Prepare next step. + // v_{j+1}[t] = r[t] / b_{j}[t] + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + beta_3D[t][j] = beta_j[t]; + beta_j[t] = 1.0 / beta_j[t]; + } + } + + std::vector c(ortho_dim_size); + for (int t = 0; t < ortho_dim_size; t++) c[t] = beta_j[t] == 0.0 ? 1.0 : 0.0; + blas3d::axpby(beta_j, r[0], c, v[j + 1]); + } + + void TRLM3D::reorder3D(std::vector &kSpace) + { + for (int t = 0; t < ortho_dim_size; t++) { + int i = 0; + if (reverse) { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] >= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } else { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] <= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } + } + } + + void TRLM3D::eigensolveFromArrowMat3D() + { + getProfile().TPSTART(QUDA_PROFILE_EIGEN); + + // Loop over the 3D problems +#pragma omp parallel for + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + + int dim = n_kr - num_locked_3D[t]; + int arrow_pos = num_keep_3D[t] - num_locked_3D[t]; + + // Eigen objects + MatrixXd A = MatrixXd::Zero(dim, dim); + ritz_mat_3D[t].resize(dim * dim); + for (int i = 0; i < dim * dim; i++) ritz_mat_3D[t][i] = 0.0; + + // Invert the spectrum due to chebyshev + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr - 1; i++) { + alpha_3D[t][i] *= -1.0; + beta_3D[t][i] *= -1.0; + } + alpha_3D[t][n_kr - 1] *= -1.0; + } + + // Construct arrow mat A_{dim,dim} + for (int i = 0; i < dim; i++) { + + // alpha_3D populates the diagonal + A(i, i) = alpha_3D[t][i + num_locked_3D[t]]; + } + + for (int i = 0; i < arrow_pos; i++) { + + // beta_3D populates the arrow + A(i, arrow_pos) = beta_3D[t][i + num_locked_3D[t]]; + A(arrow_pos, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + for (int i = arrow_pos; i < dim - 1; i++) { + + // beta_3D populates the sub-diagonal + A(i, i + 1) = beta_3D[t][i + num_locked_3D[t]]; + A(i + 1, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + // Eigensolve the arrow matrix + SelfAdjointEigenSolver eigensolver; + eigensolver.compute(A); + + // repopulate ritz matrix + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) ritz_mat_3D[t][dim * i + j] = eigensolver.eigenvectors().col(i)[j]; + + for (int i = 0; i < dim; i++) { + 
residua_3D[t][i + num_locked_3D[t]] = fabs(beta_3D[t][n_kr - 1] * eigensolver.eigenvectors().col(i)[dim - 1]); + // Update the alpha_3D array + alpha_3D[t][i + num_locked_3D[t]] = eigensolver.eigenvalues()[i]; + } + + // Put spectrum back in order + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr; i++) { alpha_3D[t][i] *= -1.0; } + } + } + } + + getProfile().TPSTOP(QUDA_PROFILE_EIGEN); + } + + void TRLM3D::computeKeptRitz3D(std::vector &kSpace) + { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + // Multi-BLAS friendly array to store part of Ritz matrix we want + std::vector> ritz_mat_keep(ortho_dim_size); + + std::vector vecs_t; + std::vector kSpace_t; + + ColorSpinorParam csParamClone(kSpace[0]); + csParamClone.create = QUDA_ZERO_FIELD_CREATE; + csParamClone.change_dim(ortho_dim, 1); + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + int dim = n_kr - num_locked_3D[t]; + int keep = iter_keep_3D[t]; + + ritz_mat_keep[t].resize(dim * keep); + for (int j = 0; j < dim; j++) { + for (int i = 0; i < keep; i++) { ritz_mat_keep[t][j * keep + i] = ritz_mat_3D[t][i * dim + j]; } + } + + // Alias the vectors we wish to keep. + vector_ref vecs_locked(kSpace.begin() + num_locked_3D[t], + kSpace.begin() + num_locked_3D[t] + dim); + + // multiBLAS axpy. Create 3D vectors so that we may perform all t independent + // vector rotations. + vecs_t.reserve(dim); + kSpace_t.reserve(keep); + + // Create 3D arrays + for (int i = vecs_t.size(); i < dim; i++) vecs_t.push_back(csParamClone); + for (int i = kSpace_t.size(); i < keep; i++) kSpace_t.push_back(csParamClone); + + blas::zero(kSpace_t); + + // Copy data to the 3D arrays, zero out the workspace, make pointers + for (int i = 0; i < dim; i++) blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i], vecs_locked[i]); + + // Compute the axpy + blas::block::axpy(ritz_mat_keep[t], {vecs_t.begin(), vecs_t.begin() + dim}, + {kSpace_t.begin(), kSpace_t.begin() + keep}); + + // Copy back to the 4D workspace array + + // Copy compressed Krylov + for (int i = 0; i < keep; i++) { + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, kSpace_t[i], kSpace[num_locked_3D[t] + i]); + } + + // Update residual vector + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[0], kSpace[n_kr]); + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, vecs_t[0], kSpace[num_locked_3D[t] + keep]); + + // Update sub arrow matrix + for (int i = 0; i < keep; i++) + beta_3D[t][i + num_locked_3D[t]] = beta_3D[t][n_kr - 1] * ritz_mat_3D[t][dim * (i + 1) - 1]; + } + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + } + + // Orthogonalise r[t][0:] against V_[t][0:j] + void TRLM3D::blockOrthogonalize3D(std::vector &vecs, std::vector &rvecs, int j) + { + for (int i = 0; i < j; i++) { + std::vector s_t(ortho_dim_size, 0.0); + std::vector unit_new(ortho_dim_size, {1.0, 0.0}); + + // Block dot products stored in s_t. + blas3d::cDotProduct(s_t, vecs[i], rvecs[0]); + for (int t = 0; t < ortho_dim_size; t++) { s_t[t] *= active_3D[t] ?
-1.0 : 0.0; } + + // Block orthogonalise + blas3d::caxpby(s_t, vecs[i], unit_new, rvecs[0]); + } + } + + void TRLM3D::prepareInitialGuess3D(std::vector &kSpace, int ortho_dim_size) + { + auto nrm = blas::norm2(kSpace[0]); + if (!std::isfinite(nrm) || nrm == 0.0) { + RNG rng(kSpace[0], 1234); + spinorNoise(kSpace[0], rng, QUDA_NOISE_UNIFORM); + } + + std::vector norms(ortho_dim_size, 0.0); + std::vector zeros(ortho_dim_size, 0.0); + blas3d::reDotProduct(norms, kSpace[0], kSpace[0]); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, kSpace[0], zeros, kSpace[0]); + } + + double TRLM3D::estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) + { + RNG rng(in, 1234); + spinorNoise(in, rng, QUDA_NOISE_UNIFORM); + + // Power iteration + std::vector norms(ortho_dim_size, 0.0); + std::vector zeros(ortho_dim_size, 0.0); + for (int i = 0; i < 100; i++) { + if ((i + 1) % 10 == 0) { + blas3d::reDotProduct(norms, in, in); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, in, zeros, in); + } + mat(out, in); + std::swap(out, in); + } + + // Compute spectral radius estimate + std::vector inner_products(ortho_dim_size, 0.0); + blas3d::reDotProduct(inner_products, out, in); + + auto result = *std::max_element(inner_products.begin(), inner_products.end()); + comm_allreduce_max(result); + if (verbose_rank && getVerbosity() >= QUDA_VERBOSE) + printf("Chebyshev max = %e (rank = %d)\n", result, comm_rank_global()); + + // Increase final result by 10% for safety + return result * 1.10; + } + + void TRLM3D::computeEvals(std::vector &evecs, std::vector &evals, int size) + { + if (size == 0) size = n_conv; + if (size > (int)evecs.size()) + errorQuda("Requesting %d eigenvectors with only storage allocated for %lu", size, evecs.size()); + if (size > (int)evals.size()) + errorQuda("Requesting %d eigenvalues with only storage allocated for %lu", size, evals.size()); + + ColorSpinorParam csParamClone(evecs[0]); + ColorSpinorField tmp(csParamClone); + + std::vector> evals_t(size, std::vector(ortho_dim_size, 0.0)); + std::vector norms(ortho_dim_size, 0.0); + std::vector unit(ortho_dim_size, 1.0); + std::vector n_unit(ortho_dim_size, {-1.0, 0.0}); + + for (int i = 0; i < size; i++) { + // r = A * v_i + mat(tmp, evecs[i]); + + // lambda_i = v_i^dag A v_i / (v_i^dag * v_i) + // Block dot products stored in s_t. 
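// Editor's note: per timeslice t the two slice-wise reductions below evaluate
//   lambda_i[t] = <v_i, A v_i>_t / ||v_i||_t,
// one complex dot product for the numerator and one real norm for the
// denominator; for eigenvectors that are unit-normalized on each slice this
// is the Rayleigh quotient restricted to that slice.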
+ blas3d::cDotProduct(evals_t[i], evecs[i], tmp); + blas3d::reDotProduct(norms, evecs[i], evecs[i]); + for (int t = 0; t < ortho_dim_size; t++) evals_t[i][t] /= sqrt(norms[t]); + + // Measure ||lambda_i*v_i - A*v_i|| + blas3d::caxpby(evals_t[i], evecs[i], n_unit, tmp); + blas3d::reDotProduct(norms, tmp, tmp); + for (int t = 0; t < ortho_dim_size; t++) residua_3D[t][i] = sqrt(norms[t]); + } + + // If size = n_conv, this routine is called post sort + if (size == n_conv) { + evals.resize(ortho_dim_size * comm_dim_global(ortho_dim) * n_conv, 0.0); + + if (verbose_rank) { + int t_offset = ortho_dim_size * comm_coord_global(3); + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < size; i++) { + + // Use printf to get data from t dim only + if (getVerbosity() >= QUDA_VERBOSE) { + printf("Eval[%02d][%04d] = (%+.16e,%+.16e) residual = %+.16e\n", t_offset + t, i, evals_t[i][t].real(), + evals_t[i][t].imag(), residua_3D[t][i]); + } + + // Transfer evals to eval array + evals[(t_offset + t) * size + i] = evals_t[i][t]; + } + } + } + } + } + +} // namespace quda diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index 8130897cbb..52f4fd1100 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -104,6 +104,10 @@ namespace quda logQuda(QUDA_VERBOSE, "Creating TR Lanczos eigensolver\n"); eig_solver = new TRLM(mat, eig_param); break; + case QUDA_EIG_TR_LANCZOS_3D: + logQuda(QUDA_VERBOSE, "Creating TR Lanczos 3-d eigensolver\n"); + eig_solver = new TRLM3D(mat, eig_param); + break; case QUDA_EIG_BLK_TR_LANCZOS: logQuda(QUDA_VERBOSE, "Creating Block TR Lanczos eigensolver\n"); eig_solver = new BLKTRLM(mat, eig_param); @@ -236,7 +240,7 @@ namespace quda int n_eig = n_conv; if (compute_svd) n_eig *= 2; kSpace.resize(n_eig); - evals.resize(n_conv); + if (eig_param->eig_type != QUDA_EIG_TR_LANCZOS_3D) evals.resize(n_conv); // Only save if outfile is defined if (strcmp(eig_param->vec_outfile, "") != 0) { @@ -580,6 +584,7 @@ namespace quda void EigenSolver::computeEvals(std::vector &evecs, std::vector &evals, int size) { + if (size == 0) size = n_conv; auto batch_size = eig_param->compute_evals_batch_size; if (size > static_cast(evecs.size())) @@ -683,7 +688,7 @@ namespace quda case QUDA_SPECTRUM_SR_EIG: printfQuda("'SR' -> sort with real(x) in increasing algebraic order, smallest first.\n"); break; case QUDA_SPECTRUM_LI_EIG: printfQuda("'LI' -> sort with imag(x) in decreasing algebraic order, largest first.\n"); break; case QUDA_SPECTRUM_SI_EIG: printfQuda("'SI' -> sort with imag(x) in increasing algebraic order, smallest first\n"); break; - default: errorQuda("Unkown spectrum type requested: %d", spec_type); + default: errorQuda("Unknown spectrum type requested: %d", spec_type); } } diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index 5ea09e3059..7283bf39d0 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -35,6 +35,21 @@ namespace quda { } } + template FieldTmp::FieldTmp(typename T::param_type param) + { + param.create = QUDA_REFERENCE_FIELD_CREATE; + key = FieldKey(T(param)); + + auto it = cache.find(key); + if (it != cache.end() && it->second.size()) { // found an entry + tmp = std::move(it->second.top()); + it->second.pop(); // pop the defunct object + } else { // no entry found, we must allocate a new field + param.create = QUDA_ZERO_FIELD_CREATE; + tmp = T(param); + } + } + template FieldTmp::~FieldTmp() { // don't cache the field if it's empty (e.g., has been moved) diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 
20cf7621ac..40a1351656 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -1355,7 +1355,7 @@ namespace quda { qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost); } else { if (is_pointer_array(order)) { - char *dst_buffer = reinterpret_cast(buffer); + char *dst_buffer = static_cast(buffer); for (int d = 0; d < site_dim; d++) { std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim); } @@ -1375,7 +1375,7 @@ namespace quda { qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice); } else { if (is_pointer_array(order)) { - const char *dst_buffer = reinterpret_cast(buffer); + const char *dst_buffer = static_cast(buffer); for (int d = 0; d < site_dim; d++) { std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim); } @@ -1389,4 +1389,9 @@ namespace quda { } } + void GaugeField::PrintMatrix(int dim, int parity, unsigned int x_cb, int rank) const + { + genericPrintMatrix(*this, dim, parity, x_cb, rank); + } + } // namespace quda diff --git a/lib/gauge_laplace.cpp b/lib/gauge_laplace.cpp index 00843bb272..752670b676 100644 --- a/lib/gauge_laplace.cpp +++ b/lib/gauge_laplace.cpp @@ -29,7 +29,7 @@ namespace quda { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, comm_dim, profile); } void GaugeLaplace::DslashXpay(cvector_ref &out, cvector_ref &in, @@ -43,7 +43,7 @@ namespace quda { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, comm_dim, profile); } void GaugeLaplace::M(cvector_ref &out, cvector_ref &in) const @@ -79,7 +79,11 @@ namespace quda { // do nothing } - GaugeLaplacePC::GaugeLaplacePC(const DiracParam ¶m) : GaugeLaplace(param) { } + GaugeLaplacePC::GaugeLaplacePC(const DiracParam ¶m) : GaugeLaplace(param) + { + for (auto i = 0; i < 4; i++) + if (laplace3D == i) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + } GaugeLaplacePC::GaugeLaplacePC(const GaugeLaplacePC &dirac) : GaugeLaplace(dirac) { } diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index 0da4412d17..2cc755bc81 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -1,5 +1,6 @@ #include #include +#include namespace quda { @@ -61,7 +62,7 @@ namespace quda { norm_ = norm(u, d, type); // factor of two to account for spin with MG fields } else { if constexpr (sizeof...(N) > 0) { - norm_ = norm(u, d, type, IntList()); + norm_ = norm(u, d, type, IntList()); } else { errorQuda("Nc = %d has not been instantiated", u.Ncolor()); } @@ -108,4 +109,76 @@ namespace quda { return nrm; } + template void print_matrix(const Order &o, int d, int parity, unsigned int x_cb) + { + for (int r = 0; r < o.Ncolor(); r++) { + printf("rank %d parity %d x %u row %d", comm_rank(), parity, x_cb, r); + for (int c = 0; c < o.Ncolor(); c++) { + auto value = complex(o(d, parity, x_cb, r, c)); + printf(" (%g,%g)", value.real(), value.imag()); + } + printf("\n"); + } + } + + template + void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb) + { + switch (a.FieldOrder()) { + case QUDA_FLOAT2_GAUGE_ORDER: + print_matrix(FieldOrder(a), d, parity, x_cb); + break; + case QUDA_QDP_GAUGE_ORDER: + print_matrix(FieldOrder(a), d, parity, x_cb); + break; + case 
QUDA_MILC_GAUGE_ORDER: + print_matrix(FieldOrder(a), d, parity, x_cb); + break; + default: errorQuda("Unsupported field order %d", a.FieldOrder()); + } + } + + template + void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, IntList) + { + if (a.Ncolor() == nColor) { + genericPrintMatrix(a, d, parity, x_cb); + } else { + if constexpr (sizeof...(N) > 0) { + genericPrintMatrix(a, d, parity, x_cb, IntList()); + } else { + errorQuda("Not supported Ncolor = %d", a.Ncolor()); + } + } + } + + template void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb) + { + genericPrintMatrix(a, d, parity, x_cb, IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@>()); + } + + void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, int rank) + { + GaugeFieldParam param(a); + param.field = const_cast(&a); + param.location = QUDA_CPU_FIELD_LOCATION; + param.create = QUDA_COPY_FIELD_CREATE; + // if field is a pinned device field then we need to clone it on the host + bool host_clone + = (a.Location() == QUDA_CUDA_FIELD_LOCATION && a.MemType() == QUDA_MEMORY_DEVICE && !use_managed_memory()) ? true : + false; + std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); + const GaugeField &a_ = !host_clone ? a : *clone_a.get(); + + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + + switch (a.Precision()) { + case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + case QUDA_SINGLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + case QUDA_HALF_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + case QUDA_QUARTER_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; + default: errorQuda("Precision %d not implemented", a.Precision()); + } + } + } // namespace quda diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index e5b678acfd..6fd6382488 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1434,7 +1434,7 @@ void endQuda(void) namespace quda { - void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { double kappa = inv_param->kappa; if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { @@ -1549,7 +1549,7 @@ namespace quda { diracParam.use_mobius_fused_kernel = inv_param->use_mobius_fused_kernel; } - void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1567,7 +1567,7 @@ namespace quda { inv_param->cuda_prec_sloppy); } - void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1586,7 +1586,7 @@ namespace quda { } // The preconditioner currently mimicks the sloppy operator with no comms - void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms) + void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc, bool comms) { setDiracParam(diracParam, inv_param, pc); @@ -1617,7 +1617,7 @@ namespace quda { inv_param->cuda_prec_precondition); } - void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracEigParam(DiracParam &diracParam, 
QudaInvertParam *inv_param, bool pc, bool use_smeared_gauge) { setDiracParam(diracParam, inv_param, pc); @@ -1625,6 +1625,20 @@ namespace quda { diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatExtended : gaugeExtended; diracParam.fatGauge = gaugeFatExtended; diracParam.longGauge = gaugeLongExtended; + } else if (use_smeared_gauge) { + if (!gaugeSmeared) errorQuda("No smeared gauge field present"); + if (inv_param->dslash_type == QUDA_LAPLACE_DSLASH) { + if (gaugeSmeared->GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED) { + GaugeFieldParam gauge_param(*gaugePrecise); + GaugeField gaugeEig(gauge_param); + copyExtendedGauge(gaugeEig, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); + gaugeEig.exchangeGhost(); + std::swap(gaugeEig, *gaugeSmeared); + } + diracParam.gauge = gaugeSmeared; + } else { + errorQuda("Smeared gauge field not supported for operator %d", inv_param->dslash_type); + } } else { diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatEigensolver : gaugeEigensolver; diracParam.fatGauge = gaugeFatEigensolver; @@ -1646,7 +1660,7 @@ namespace quda { inv_param->cuda_prec_eigensolver); } - void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, const bool pc_solve) + void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1663,8 +1677,7 @@ namespace quda { dPre = Dirac::create(diracPreParam); } - void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam ¶m, - const bool pc_solve) + void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam ¶m, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1684,8 +1697,8 @@ namespace quda { dRef = Dirac::create(diracRefParam); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, - const bool pc_solve) + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, bool pc_solve, + bool use_smeared_gauge) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1696,7 +1709,7 @@ namespace quda { setDiracSloppyParam(diracSloppyParam, ¶m, pc_solve); bool pre_comms_flag = (param.schwarz_type != QUDA_INVALID_SCHWARZ) ? false : true; setDiracPreParam(diracPreParam, ¶m, pc_solve, pre_comms_flag); - setDiracEigParam(diracEigParam, ¶m, pc_solve); + setDiracEigParam(diracEigParam, ¶m, pc_solve, use_smeared_gauge); d = Dirac::create(diracParam); // create the Dirac operator dSloppy = Dirac::create(diracSloppyParam); @@ -2392,14 +2405,17 @@ void checkClover(QudaInvertParam *param) { quda::GaugeField *checkGauge(QudaInvertParam *param) { - quda::GaugeField *cudaGauge = nullptr; - if (param->dslash_type != QUDA_ASQTAD_DSLASH) { - if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist"); + quda::GaugeField *U = param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise : + gaugePrecise; - if (param->cuda_prec != gaugePrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision()); - } + if (U == nullptr) + errorQuda("Precise gauge %sfield doesn't exist", param->dslash_type == QUDA_ASQTAD_DSLASH ? 
"fat " : ""); + + if (param->cuda_prec != U->Precision()) { + errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, U->Precision()); + } + if (param->dslash_type != QUDA_ASQTAD_DSLASH) { if (param->cuda_prec_sloppy != gaugeSloppy->Precision() || param->cuda_prec_precondition != gaugePrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision() @@ -2416,18 +2432,10 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist"); if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist"); if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist"); - if (param->overlap) { - if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); - } - cudaGauge = gaugePrecise; + if (param->overlap && gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); } else { - if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist"); if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist"); - if (param->cuda_prec != gaugeFatPrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision()); - } - if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision() || param->cuda_prec_precondition != gaugeFatPrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeFatRefinement->Precision() @@ -2450,23 +2458,18 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist"); if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist"); if (gaugeFatEigensolver == nullptr) errorQuda("Eigensolver gauge fat field doesn't exist"); - if (param->overlap) { - if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); - } + if (param->overlap && gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist"); if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist"); if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist"); if (gaugeLongEigensolver == nullptr) errorQuda("Eigensolver gauge long field doesn't exist"); - if (param->overlap) { - if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); - } - cudaGauge = gaugeFatPrecise; + if (param->overlap && gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); } checkClover(param); - return cudaGauge; + return U; } void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse) @@ -2582,7 +2585,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Create the dirac operator with a sloppy and a precon. 
bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); - createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve); + createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve, eig_param->use_smeared_gauge); Dirac &dirac = *dEig; //------------------------------------------------------ @@ -2679,7 +2682,8 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Transfer Eigenpairs back to host if using GPU eigensolver. The copy // will automatically rotate from device UKQCD gamma basis to the // host side gamma basis. - for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); } + memcpy(host_evals, evals.data(), sizeof(Complex) * evals.size()); + if (!(eig_param->arpack_check)) { for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i]; } @@ -4256,7 +4260,6 @@ void computeHISQForceQuda(void* const milc_momentum, using namespace quda; using namespace quda::fermion_force; - if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order); { // default settings for the unitarization @@ -4399,7 +4402,6 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeFieldParam param(*gParam); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_REFERENCE_FIELD_CREATE; - param.order = QUDA_MILC_GAUGE_ORDER; param.link_type = QUDA_ASQTAD_MOM_LINKS; param.reconstruct = QUDA_RECONSTRUCT_10; param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; @@ -4421,7 +4423,6 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeFieldParam wParam(gParam_field); wParam.location = QUDA_CPU_FIELD_LOCATION; wParam.create = QUDA_REFERENCE_FIELD_CREATE; - wParam.order = QUDA_MILC_GAUGE_ORDER; wParam.link_type = QUDA_GENERAL_LINKS; wParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; wParam.gauge = (void *)w_link; @@ -4980,7 +4981,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, for (unsigned int i = 0; i < n_steps; i++) { if (i) in = out; - ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, comm_dim, profileWuppertal); + ApplyLaplace(out, in, *precise, 3, a, b, in, parity, comm_dim, profileWuppertal); logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out)); } @@ -5126,7 +5127,9 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); - logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", 0, + obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); // set default dir_ignore = 3 for APE and STOUT for compatibility int dir_ignore = smear_param->dir_ignore; @@ -5145,13 +5148,15 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable case QUDA_GAUGE_SMEAR_HYP: HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore); break; - default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); + default: errorQuda("Unknown gauge smear type %d", smear_param->smear_type); } if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; gaugeObservablesQuda(&obs_param[measurement_n]); - 
logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", + i + 1, obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); } } @@ -5300,7 +5305,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] = Laplace [0] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, comm_dim, profileGFlow); // [0] = [4] = Laplace [0] = Laplace [3] f_temp0 = f_temp4; @@ -5318,7 +5323,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] <- Laplace [1] copyExtendedGauge(precise, gout, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, comm_dim, profileGFlow); // [1] <- [4] f_temp1 = f_temp4; @@ -5336,7 +5341,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] <- Laplace [2] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, comm_dim, profileGFlow); // [2] <- [4] = Laplace [2] f_temp2 = f_temp4; @@ -5590,7 +5595,7 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param) auto profile = pushProfile(profileGaugeObs); checkGaugeObservableParam(param); - if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field"); + if (!gaugePrecise) errorQuda("Cannot compute gauge observables as there is no resident gauge field"); GaugeField *gauge = nullptr; if (!gaugeSmeared) { diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 0e86f80a17..a4713a16b5 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -82,10 +82,10 @@ namespace quda int iter = 0; int step = 0; - bool converged = false; + bool is_done = false; PrintStats("MR", iter, r2, b2); - while (!converged) { + while (!is_done) { int k = 0; vector scale(b.size(), 1.0); @@ -142,22 +142,23 @@ namespace quda commGlobalReductionPop(); // renable global reductions for outer solver } + step++; + // FIXME - add over/under relaxation in outer loop bool compute_true_res = param.compute_true_res || param.Nsteps > 1; if (compute_true_res) { mat(r, x); r2 = blas::xmyNorm(b, r); for (auto i = 0u; i < b2.size(); i++) param.true_res[i] = sqrt(r2[i] / b2[i]); - converged = (step < param.Nsteps && r2 < stop) ? true : false; - if (!converged) blas::copy(r_sloppy, r); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r_sloppy, r); PrintStats("MR (restart)", iter, r2, b2); } else { blas::ax(scale, r_sloppy); r2 = blas::norm2(r_sloppy); - converged = (step < param.Nsteps && r2 < stop) ? 
true : false; - if (!converged) blas::copy(r, r_sloppy); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r, r_sloppy); } - step++; } PrintSummary("MR", iter, r2, b2, stopping(param.tol, b2, param.residual_type)); diff --git a/lib/laplace.cu b/lib/laplace.cu index aab6dd2d8e..5e5c3803eb 100644 --- a/lib/laplace.cu +++ b/lib/laplace.cu @@ -151,18 +151,18 @@ namespace quda LaplaceApply(cvector_ref &out, cvector_ref &in, cvector_ref &x, const GaugeField &U, int dir, double a, double b, int parity, - bool dagger, const int *comm_override, TimeProfile &profile) + const int *comm_override, TimeProfile &profile) { constexpr int nDim = 4; auto halo = ColorSpinorField::create_comms_batch(in); if (in.Nspin() == 1) { constexpr int nSpin = 1; - LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace laplace(arg, out, in, halo); dslash::DslashPolicyTune policy(laplace, in, halo, profile); } else if (in.Nspin() == 4) { constexpr int nSpin = 4; - LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace laplace(arg, out, in, halo); dslash::DslashPolicyTune policy(laplace, in, halo, profile); } else { @@ -175,11 +175,11 @@ namespace quda // out(x) = M*in = - a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu) + b*in(x) // Omits direction 'dir' from the operator. void ApplyLaplace(cvector_ref &out, cvector_ref &in, const GaugeField &U, - int dir, double a, double b, cvector_ref &x, int parity, bool dagger, + int dir, double a, double b, cvector_ref &x, int parity, const int *comm_override, TimeProfile &profile) { if constexpr (is_enabled()) { - instantiate(out, in, x, U, dir, a, b, parity, dagger, comm_override, profile); + instantiate(out, in, x, U, dir, a, b, parity, comm_override, profile); } else { errorQuda("Laplace operator has not been enabled"); } diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index c620aaaa57..dbc03f15cf 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -968,7 +968,7 @@ namespace quda } transfer->R(x_coarse[0], tmp2); - static_cast(diracCoarseResidual)->M(r_coarse, tmp_coarse); + static_cast(diracCoarseResidual)->M(r_coarse[0], tmp_coarse); #if 0 // enable to print out emulated and actual coarse-grid operator vectors for debugging setOutputPrefix(""); diff --git a/lib/multigrid.in.hpp b/lib/multigrid.in.hpp index 46da3568e0..55ac01280c 100644 --- a/lib/multigrid.in.hpp +++ b/lib/multigrid.in.hpp @@ -38,7 +38,7 @@ namespace quda { param.nVec = nVec; param.create = QUDA_NULL_FIELD_CREATE; param.fieldOrder = order; - return ColorSpinorField(param); + return getFieldTmp(param); } inline auto create_color_spinor_copy(const ColorSpinorField &f, QudaFieldOrder order) diff --git a/lib/prolongator.in.cpp b/lib/prolongator.in.cpp index e39fa7ee49..2c01ca2eb6 100644 --- a/lib/prolongator.in.cpp +++ b/lib/prolongator.in.cpp @@ -28,9 +28,9 @@ namespace quda if constexpr (use_mma) { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_in = create_color_spinor_copy(in, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); - ColorSpinorField V = create_color_spinor_copy(v, csOrder); + auto v_in = create_color_spinor_copy(in, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); + auto V = create_color_spinor_copy(v, csOrder); BlockTransposeForward(v_in, in); V.copy(v); diff 
--git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90 index 8eda362fbe..fc7666dd60 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -224,9 +224,6 @@ module quda_fortran real(8) :: temp real(8) :: clock - ! Enable auto-tuning? - QudaTune :: tune - ! Number of steps in s-step algorithms integer(4) :: nsteps diff --git a/lib/reduce_helper.cu b/lib/reduce_helper.cu index cf95032656..72d18ba433 100644 --- a/lib/reduce_helper.cu +++ b/lib/reduce_helper.cu @@ -44,7 +44,7 @@ namespace quda void apply(const qudaStream_t &stream) { // intentionally do not autotune, since this can be called inside a tuning region - auto tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity()); + auto tp = tuneLaunch(*this, false, getVerbosity()); launch_device(tp, stream, init_arg(reduce_count, n_reduce)); } }; diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp index 96fc6f33fa..ca59737cb8 100644 --- a/lib/restrictor.in.cpp +++ b/lib/restrictor.in.cpp @@ -26,9 +26,9 @@ namespace quda if constexpr (coarseColor >= fineColor) { if constexpr (use_mma) { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_in = create_color_spinor_copy(in, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); - ColorSpinorField V = create_color_spinor_copy(v, csOrder); + auto v_in = create_color_spinor_copy(in, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); + auto V = create_color_spinor_copy(v, csOrder); bool from_non_rel = (in.Nspin() == 4) && (in[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS); BlockTransposeForward(v_in, in, from_non_rel); diff --git a/lib/solve.cpp b/lib/solve.cpp index c9ba9baf21..79fa868633 100644 --- a/lib/solve.cpp +++ b/lib/solve.cpp @@ -317,8 +317,8 @@ namespace quda getProfile().TPSTOP(QUDA_PROFILE_EPILOGUE); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, - const bool pc_solve); + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam ¶m, bool pc_solve, + bool use_smeared_gauge); extern std::vector solutionResident; @@ -349,7 +349,8 @@ namespace quda // Create the dirac operator and operators for sloppy, precondition, // and an eigensolver - createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve); + createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve, + param.eig_param ? static_cast(param.eig_param)->use_smeared_gauge : false); // wrap CPU host side pointers ColorSpinorParam cpuParam(hp_b[0], param, u.X(), pc_solution, param.input_location); diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index 719bfb27e6..234fbf35a3 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,8 +103,11 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?" 
ON + "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) +mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT) mark_as_advanced(QUDA_JITIFY) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM_TAR) diff --git a/lib/targets/hip/target_hip.cmake b/lib/targets/hip/target_hip.cmake index c8f8604040..5c69a52384 100644 --- a/lib/targets/hip/target_hip.cmake +++ b/lib/targets/hip/target_hip.cmake @@ -54,9 +54,15 @@ set(CMAKE_HIP_FLAGS_RELEASE set(CMAKE_HIP_FLAGS_HOSTDEBUG "-g" CACHE STRING "Flags used by the C++ compiler during host-debug builds.") -set(CMAKE_HIP_FLAGS_DEBUG - "-g -G" - CACHE STRING "Flags used by the C++ compiler during full (host+device) debug builds.") +if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + set(CMAKE_HIP_FLAGS_DEBUG + "-g -G" + CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for NVIDIA).") +else() + set(CMAKE_HIP_FLAGS_DEBUG + "-g" + CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for Clang).") +endif() set(CMAKE_HIP_FLAGS_SANITIZE "-g " CACHE STRING "Flags used by the C++ compiler during sanitizer debug builds.") diff --git a/lib/tune.cpp b/lib/tune.cpp index 3bcd7da964..2aefd3988f 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -152,9 +152,37 @@ namespace quda const map &getTuneCache() { return tunecache; } /** - * Deserialize tunecache from an istream, useful for reading a file or receiving from other nodes. + * @brief Distribute the tunecache from a given rank to all other nodes. + * @param[in] root_rank From which global rank to do the broadcast + * @param[out] tc Where we wish to receive the tunecache. This + * defaults to the local tunecache. + */ + static void broadcastTuneCache(int32_t root_rank = 0, map &tc_recv = tunecache); + + void joinTuneCache(const std::vector &global_tune_rank) + { + // vector of the split tunecaches + std::vector split_tc(global_tune_rank.size()); + // broadcast each tunecache to every process + for (auto i = 0u; i < global_tune_rank.size(); i++) { + broadcastTuneCache(global_tune_rank[i], split_tc[i]); + if (comm_rank() == global_tune_rank[i]) split_tc[i] = tunecache; + logQuda(QUDA_DEBUG_VERBOSE, "i = %d tune_rank = %d tc size = %lu\n", i, global_tune_rank[i], split_tc[i].size()); + } + + // now merge the maps + tunecache = split_tc[0]; + for (auto i = 1u; i < global_tune_rank.size(); i++) { tunecache.merge(split_tc[i]); } + } + + /** + * Deserialize tunecache from an istream, useful for reading a file + * or receiving from other nodes. + * @param[in] in The stream from which we are deserializing + * @param[out] tc The tunecache to which we are deserializing. This + * defaults to the local tunecache. */ - static void deserializeTuneCache(std::istream &in) + static void deserializeTuneCache(std::istream &in, map &tc = tunecache) { std::string line; std::stringstream ls; @@ -185,7 +213,7 @@ namespace quda ls.ignore(1); // throw away tab before comment getline(ls, param.comment); // assume anything remaining on the line is a comment param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this - tunecache[key] = param; + tc[key] = param; } } @@ -320,30 +348,26 @@ namespace quda } } - /** - * @brief Distribute the tunecache from a given rank to all other nodes. 
- * @param[in] root_rank From which global rank to do the broadcast - */ - static void broadcastTuneCache(int32_t root_rank = 0) + static void broadcastTuneCache(int32_t root_rank, map &tc_recv) { std::stringstream serialized; size_t size; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serializeTuneCache(serialized); size = serialized.str().length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast(serialized.str().c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast(serialized.str().c_str()), size, root_rank); } else { std::vector serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate serialized.str(serstr.data()); - deserializeTuneCache(serialized); + deserializeTuneCache(serialized, tc_recv); } } } @@ -353,7 +377,7 @@ namespace quda */ void loadTuneCache() { - if (getTuning() == QUDA_TUNE_NO) { + if (!getTuning()) { warningQuda("Autotuning disabled"); return; } @@ -826,20 +850,20 @@ namespace quda */ void broadcast(int32_t root_rank) { - size_t size; + size_t size = 0; std::string serialized; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serialized = serialize(); size = serialized.length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast(serialized.c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast(serialized.c_str()), size, root_rank); } else { std::vector serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate std::string_view deserialized(serstr.data()); deserialize(deserialized); @@ -859,7 +883,7 @@ namespace quda * Return the optimal launch parameters for a given kernel, either * by retrieving them from tunecache or autotuning on the spot. 
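 * @param[in] tunable The kernel instance for which we seek launch parameters
 * @param[in] enabled Whether autotuning is enabled; if not, the default
 * launch parameters are returned
 * @param[in] verbosity The verbosity to use while tuning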
*/ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity) + TuneParam tuneLaunch(Tunable &tunable, bool enabled, QudaVerbosity verbosity) { #ifdef LAUNCH_TIMER launchTimer.TPSTART(QUDA_PROFILE_TOTAL); @@ -880,7 +904,7 @@ namespace quda it = tunecache.find(key); // first check if we have the tuned value and return if we have it - if (enabled == QUDA_TUNE_YES && it != tunecache.end()) { + if (enabled && it != tunecache.end()) { #ifdef LAUNCH_TIMER launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE); @@ -926,7 +950,7 @@ namespace quda static TuneParam param; - if (enabled == QUDA_TUNE_NO) { + if (!enabled) { TuneParam param_default; param_default.aux = make_int4(-1, -1, -1, -1); tunable.defaultTuneParam(param_default); @@ -959,7 +983,7 @@ namespace quda - we are tuning an uber kernel in which case do the tuning on all ranks since we can't guarantee that all nodes are partaking */ - if (comm_rank_global() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { + if (comm_rank() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { TuneParam best_param; TuneCandidates tc(tunable.num_candidates()); float best_time; diff --git a/lib/util_quda.cpp b/lib/util_quda.cpp index e279c0f43e..b033efb20c 100644 --- a/lib/util_quda.cpp +++ b/lib/util_quda.cpp @@ -51,17 +51,18 @@ bool getRankVerbosity() { return rank_verbosity; } +static bool tune = true; + // default has autotuning enabled but can be overridden with the QUDA_ENABLE_TUNING environment variable -QudaTune getTuning() { +bool getTuning() +{ static bool init = false; - static QudaTune tune = QUDA_TUNE_YES; - if (!init) { char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); - if (!enable_tuning || strcmp(enable_tuning,"0")!=0) { - tune = QUDA_TUNE_YES; + if (!enable_tuning || strcmp(enable_tuning, "0") != 0) { + tune = true; } else { - tune = QUDA_TUNE_NO; + tune = false; } init = true; } @@ -69,6 +70,34 @@ QudaTune getTuning() { return tune; } +void setTuning(bool tuning) +{ + // first check if tuning is disabled, in which case we do nothing + static bool init = false; + static bool tune_disable = false; + if (!init) { + char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); + tune_disable = (enable_tuning && strcmp(enable_tuning, "0") == 0); + init = true; + } + if (!tune_disable) tune = tuning; +} + +static std::stack tstack; + +void pushTuning(bool tuning) +{ + tstack.push(getTuning()); + setTuning(tuning); +} + +void popTuning() +{ + if (tstack.empty()) errorQuda("popTuning() called with empty stack"); + setTuning(tstack.top()); + tstack.pop(); +} + void setOutputPrefix(const char *prefix) { strncpy(prefix_, prefix, MAX_PREFIX_SIZE); @@ -80,7 +109,6 @@ void setOutputFile(FILE *outfile) outfile_ = outfile; } - static std::stack vstack; void pushVerbosity(QudaVerbosity verbosity) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b295e00d9c..da3e7a97bf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,7 +110,7 @@ if(QUDA_DIRAC_WILSON install(TARGETS deflated_invert_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() -if(QUDA_DIRAC_STAGGERED) +if(QUDA_DIRAC_STAGGERED OR QUDA_DIRAC_LAPLACE) add_executable(staggered_dslash_test staggered_dslash_test.cpp) target_link_libraries(staggered_dslash_test ${TEST_LIBS}) quda_checkbuildtest(staggered_dslash_test QUDA_BUILD_ALL_TESTS) @@ -291,6 +291,8 @@ if(QUDA_MPI OR QUDA_QMP) message(STATUS "ctest will run on ${QUDA_TEST_NUM_PROCS} processes") set(QUDA_CTEST_LAUNCH 
${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS} CACHE STRING "CTest Launcher command for QUDA's tests") +else() + set(QUDA_TEST_NUM_PROCS 1) endif() # BLAS tests @@ -963,7 +965,7 @@ endif() set_tests_properties(dslash_${DIRAC_NAME}_build_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2}) endif() - if(QUDA_LAPLACE) + if(QUDA_DIRAC_LAPLACE) set(DIRAC_NAME laplace) add_test(NAME dslash_${DIRAC_NAME}_mat_policy${pol2} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} @@ -1009,19 +1011,15 @@ if(QUDA_DIRAC_WILSON) --enable-testing true --gtest_output=xml:invert_test_wilson.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson.xml) + add_test(NAME invert_test_splitgrid_wilson + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --ngcrkrylov 8 + --dim 2 4 6 8 --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_wilson.xml) - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) endif() if(QUDA_DIRAC_TWISTED_MASS) @@ -1186,19 +1184,15 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:invert_test_staggered_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_staggered_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) add_test(NAME invert_test_asqtad_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} @@ -1207,21 +1201,15 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --nsrc 4 --nsrc-tile 4 --gtest_output=xml:invert_test_asqtad_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_asqtad_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true - --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - 
--gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_asqtad_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true + --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) - if (QUDA_LAPLACE) + if (QUDA_DIRAC_LAPLACE) add_test(NAME invert_test_laplace_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type laplace --ngcrkrylov 8 --compute-fat-long true @@ -1243,21 +1231,6 @@ if (QUDA_DIRAC_DISTANCE_PRECONDITIONING) --distance-pc-alpha0 0.1 --distance-pc-t0 4 --enable-testing true --gtest_output=xml:invert_test_wilson_distance_pc.xml) - - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --distance-pc-alpha0 0.1 --distance-pc-t0 4 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson_distance_pc.xml) - - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() endif() if(QUDA_DIRAC_CLOVER) @@ -1469,7 +1442,7 @@ foreach(prec IN LISTS TEST_PRECS) --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml) endif() - if (QUDA_LAPLACE) + if (QUDA_DIRAC_LAPLACE) add_test(NAME eigensolve_test_laplace_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type laplace --compute-fat-long true diff --git a/tests/clover_force_test.cpp b/tests/clover_force_test.cpp index 4a9edb57b2..5847bec40a 100644 --- a/tests/clover_force_test.cpp +++ b/tests/clover_force_test.cpp @@ -127,7 +127,7 @@ std::tuple clover_force_test(test_t param) gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER; gauge_param.overwrite_mom = 1; - if (getTuning() == QUDA_TUNE_YES) + if (getTuning()) computeTMCloverForceQuda(mom.data(), in.data(), in0.data(), coeff.data(), nvector, &gauge_param, &inv_param, detratio); diff --git a/tests/contract_ft_test.cpp b/tests/contract_ft_test.cpp index cc793ede5b..740adf6cc9 100644 --- a/tests/contract_ft_test.cpp +++ b/tests/contract_ft_test.cpp @@ -171,9 +171,9 @@ inline int launch_contract_test(const QudaContractType cType, const std::array(buffs, X, dof); - for (int s = 0; s < nprops; ++s, off += spinor_field_floats * sizeof(Float)) { - spinorX[s] = (void *)((uintptr_t)buffs[0].data() + off); - spinorY[s] = (void *)((uintptr_t)buffs[1].data() + off); + for (int s = 0; s < nprops; ++s, off += spinor_field_floats) { + spinorX[s] = static_cast(buffs[0].data() + off); + spinorY[s] = static_cast(buffs[1].data() + off); } // Perform GPU contraction: void *d_result_ = static_cast(d_result.data()); @@ -181,9 +181,8 @@ inline int launch_contract_test(const QudaContractType cType, const std::array((Float **)spinorX.data(), (Float **)spinorY.data(), d_result.data(), cType, - src_colors, X.data(), source_position.data(), n_mom, mom.data(), fft_type.data()); + int faults = contractionFT_reference(spinorX.data(), spinorY.data(), d_result.data(), cType, src_colors, + X.data(), source_position.data(), n_mom, mom.data(), fft_type.data()); return 
faults; } diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index 3d77668d97..5512c3d0a2 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -428,9 +428,8 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: // My dslash should be the same as the clover dslash - for (int i = 0; i < Nsrc; i++) - clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, - inv_param.cpu_prec, gauge_param); + clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, + inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: // my matpc op @@ -1062,12 +1061,12 @@ struct DslashTestWrapper { 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min, 2 * ghost_bytes); ::testing::Test::RecordProperty("Gflops", std::to_string(1.0e-9 * flops / dslash_time.event_time)); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); } } diff --git a/tests/eigensolve_test.cpp b/tests/eigensolve_test.cpp index e325511e05..a9561ca2a5 100644 --- a/tests/eigensolve_test.cpp +++ b/tests/eigensolve_test.cpp @@ -191,17 +191,14 @@ std::vector eigensolve(test_t test_param) // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector evecs(n_eig); quda::ColorSpinorParam cs_param; constructWilsonTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. 
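// Editor's note: each pointer populated below aliases the corresponding
// field's host allocation (no copy is made); QUDA reads and writes the
// eigenvectors in place through this void** interface.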
std::vector host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues std::vector<__complex__ double> evals(eig_n_conv); diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp index c5197af83c..2994904e29 100644 --- a/tests/gauge_alg_test.cpp +++ b/tests/gauge_alg_test.cpp @@ -79,7 +79,7 @@ struct GaugeAlgTest : public ::testing::TestWithParam { #ifndef QUDA_BUILD_NATIVE_FFT // skip FFT tests if FFT not available const ::testing::TestInfo *const test_info = ::testing::UnitTest::GetInstance()->current_test_info(); const char *name = test_info->name(); - if (strcmp(name, "Landau_FFT") == 0 || strcmp(name, "Coulomb_FFT") == 0) { + if (strncmp(name, "Landau_FFT", 10) == 0 || strncmp(name, "Coulomb_FFT", 11) == 0) { execute = false; GTEST_SKIP(); } diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp index 1efa49ef97..c7820d8575 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -179,7 +179,7 @@ void gauge_force_test(force_test_t test_param) errorQuda("Unsupported gauge order %d", gauge_order); } - if (getTuning() == QUDA_TUNE_YES) { + if (getTuning()) { if (compute_force) computeGaugeForceQuda(mom, sitelink, input_path_buf, length, loop_coeff_d, num_paths, max_length, eb3, &gauge_param); @@ -324,7 +324,7 @@ void gauge_loop_test(loop_test_t loop_param) // compute the plaquette as part of validation obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - if (getTuning() == QUDA_TUNE_YES) { gaugeObservablesQuda(&obsParam); } + if (getTuning()) { gaugeObservablesQuda(&obsParam); } quda::host_timer_t host_timer; // Multiple execution to exclude warmup time in the first run diff --git a/tests/host_reference/contract_ft_reference.h b/tests/host_reference/contract_ft_reference.h index 697ed550e2..e811d4ec99 100644 --- a/tests/host_reference/contract_ft_reference.h +++ b/tests/host_reference/contract_ft_reference.h @@ -58,7 +58,7 @@ template inline void FourierPhase(Float z[2], const Float theta }; template -void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2, double *h_result, +void contractFTHost(void **h_prop_array_flavor_1, void **h_prop_array_flavor_2, double *h_result, const QudaContractType cType, const int src_colors, const int *X, const int *const source_position, const int n_mom, const int *const mom_modes, const QudaFFTSymmType *const fft_type) { @@ -126,8 +126,8 @@ void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2 for (int c1 = 0; c1 < src_colors; c1++) { // color contraction size_t off = nSpin * 3 * 2 * (Vh * parity + cb_idx); - contractColors(h_prop_array_flavor_1[s1 * src_colors + c1] + off, - h_prop_array_flavor_2[s2 * src_colors + c1] + off, nSpin, M.data()); + contractColors(static_cast(h_prop_array_flavor_1[s1 * src_colors + c1]) + off, + static_cast(h_prop_array_flavor_2[s2 * src_colors + c1]) + off, nSpin, M.data()); // apply gamma matrices here @@ -158,7 +158,7 @@ void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2 }; template -int contractionFT_reference(Float **spinorX, Float **spinorY, const double *const d_result, const QudaContractType cType, +int contractionFT_reference(void **spinorX, void **spinorY, const double *const d_result, const QudaContractType cType, const int src_colors, const 
int *X, const int *const source_position, const int n_mom, const int *const mom_modes, const QudaFFTSymmType *const fft_type) { diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 5bada81761..9b6ef102c8 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -746,17 +746,17 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { std::vector out_vector(1); out_vector[0] = out; - return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, src_idx); + return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, laplace3D, src_idx); } std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; double l2r_max = 0.0; @@ -783,7 +783,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, for (int i = 0; i < multishift; i++) { auto &out = out_vector[i]; double mass = 0.5 * sqrt(inv_param.offset[i]); - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); @@ -809,10 +809,7 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, auto &out = out_vector[0]; double mass = inv_param.mass; if (inv_param.solution_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); - - // correct for the massRescale function inside invertQuda - if (is_laplace(dslash_type)) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); + stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { @@ -820,9 +817,9 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else { errorQuda("Invalid staggered solution type %d", inv_param.solution_type); } @@ -844,8 +841,9 @@ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, return {l2r_max, hqr_max}; } -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector 
&lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D) { QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; @@ -861,10 +859,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co else sol_type = QUDA_MATDAG_MAT_SOLUTION; } else { - if (use_pc) - sol_type = QUDA_MATPC_SOLUTION; - else - sol_type = QUDA_MAT_SOLUTION; + sol_type = use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; } // Create temporary spinors @@ -872,7 +867,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co quda::ColorSpinorField ref(csParam); if (sol_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } else if (sol_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { @@ -880,26 +875,60 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type, laplace3D); } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } - // Compute M * x - \lambda * x - caxpy(-lambda, spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double l2r = sqrt(nrm2 / src2); + if (laplace3D == 3) { + auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size; + auto t_offset = spinor.X()[3] * comm_coord(3); + std::vector<double> nrm2(spinor.X()[3] * comm_dim(3), 0.0); + std::vector<double> src2(spinor.X()[3] * comm_dim(3), 0.0); + // Compute M * x - \lambda * x on each slice + for (auto t = 0; t < spinor.X()[3]; t++) { + auto t_global = t_offset + t; + auto offset = t * batch_size * inv_param.cpu_prec; - printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + for (int parity = 0; parity < spinor.SiteSubset(); parity++) { + caxpy(-lambda[t_global], static_cast<char *>(spinor.data()) + offset, static_cast<char *>(ref.data()) + offset, + batch_size, inv_param.cpu_prec); - return l2r; + nrm2[t_global] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false); + src2[t_global] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false); + + offset += spinor.VolumeCB() * stag_spinor_site_size * inv_param.cpu_prec; + } + } + + comm_allreduce_sum(nrm2); + comm_allreduce_sum(src2); + + for (auto t = 0; t < spinor.X()[3] * comm_dim(3); t++) { + auto l = ((double *)&(lambda[t]))[0]; + printfQuda("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t, l, eig_param.tol, + sqrt(nrm2[t] / src2[t])); + } + auto total_nrm2 = std::accumulate(nrm2.begin(), nrm2.end(), 0.0); + auto total_src2 = std::accumulate(src2.begin(), src2.end(), 0.0); + + return sqrt(total_nrm2 / total_src2); + } else {
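For reviewers: the `laplace3D == 3` branch above computes one Rayleigh-quotient residual per timeslice and then aggregates them into the single figure the caller expects; the plain 4-d path follows below. A minimal standalone sketch of the same arithmetic on a hypothetical flat, slice-major array (plain C++, not the QUDA API):

```cpp
#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>

// Per-slice residuals ||M x - lambda_t x|| / ||x|| for Lt timeslices, then the
// aggregate residual over all slices (mirrors the accumulate/return above).
// Layout assumption (hypothetical): slice t occupies [t * slice_len, (t + 1) * slice_len).
double slice_residuals(const std::vector<std::complex<double>> &Mx, const std::vector<std::complex<double>> &x,
                       const std::vector<std::complex<double>> &lambda, int Lt)
{
  const std::size_t slice_len = x.size() / Lt;
  double total_nrm2 = 0.0, total_src2 = 0.0;
  for (int t = 0; t < Lt; t++) {
    double nrm2 = 0.0, src2 = 0.0;
    for (std::size_t k = t * slice_len; k < (t + 1) * slice_len; k++) {
      std::complex<double> r = Mx[k] - lambda[t] * x[k]; // M x - lambda_t x on this slice
      nrm2 += std::norm(r);
      src2 += std::norm(x[k]);
    }
    std::printf("t = %d host residual = %.15e\n", t, std::sqrt(nrm2 / src2));
    total_nrm2 += nrm2;
    total_src2 += src2;
  }
  return std::sqrt(total_nrm2 / total_src2);
}
```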
// Compute M * x - \lambda * x + caxpy(-lambda[0], spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double l2r = sqrt(nrm2 / src2); + printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + return l2r; + } } double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link) + const std::vector &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D) { + if (laplace3D == 3) errorQuda("3-d Laplace operator not supported"); QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); @@ -912,10 +941,10 @@ double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, qu quda::ColorSpinorField ref(csParam); // Only `mat` is used here - stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type, laplace3D); // Compute M * x_left - \sigma * x_right - caxpy(-sigma, spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + caxpy(-sigma[0], spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double src2 = norm_2(spinor_left.data(), spinor_left.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double l2r = sqrt(nrm2 / src2); diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index 3c9543107e..307bb1db70 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -217,11 +217,13 @@ std::array verifyWilsonTypeInversion(void *spinorOut, void **spinorOu * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx); + QudaInvertParam &inv_param, int laplace3D, int src_idx); /** * @brief Verify a single- or multi-shift staggered inversion on the host * @param in The host source * @param out_vector The host solution(s) * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when
doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx = 0); + QudaInvertParam &inv_param, int laplace3D, int src_idx = 0); /** * @brief Verify a staggered-type eigenvector * * @param spinor The host eigenvector to be verified - * @param lambda The host eigenvalue to be verified + * @param lambda The host eigenvalue(s) to be verified * @param i The number of the eigenvalue, only used when printing outputs * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector &lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D); /** * @brief Verify a staggered-type singular vector @@ -262,11 +268,12 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link); + const std::vector &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D); /** * @brief Verify the spinor distance reweighting diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index ce285a6b99..741efdf0b7 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -426,7 +426,7 @@ static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_ /* This function only computes one direction @dir * */ -void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *sitelink, void *const *sitelink_ex, +void gauge_force_reference_dir(void *refMom, int dir, double eb3, quda::GaugeField &u, quda::GaugeField &u_ex, QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths, const lattice_t &lat, bool compute_force) { @@ -437,26 +437,30 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s for (int i = 0; i < num_paths; i++) { if (prec == QUDA_DOUBLE_PRECISION) { double *my_loop_coeff = (double *)loop_coeff; - compute_path_product((dsu3_matrix *)staple, (dsu3_matrix **)sitelink_ex, path_dir[i], length[i], my_loop_coeff[i], - dir, lat); + compute_path_product((dsu3_matrix *)staple, u_ex.data_array().data, path_dir[i], length[i], + my_loop_coeff[i],
dir, lat); } else { float *my_loop_coeff = (float *)loop_coeff; - compute_path_product((fsu3_matrix *)staple, (fsu3_matrix **)sitelink_ex, path_dir[i], length[i], my_loop_coeff[i], - dir, lat); + compute_path_product((fsu3_matrix *)staple, u_ex.data_array().data, path_dir[i], length[i], + my_loop_coeff[i], dir, lat); } } if (compute_force) { if (prec == QUDA_DOUBLE_PRECISION) { - update_mom((danti_hermitmat *)refMom, dir, (dsu3_matrix **)sitelink, (dsu3_matrix *)staple, (double)eb3, lat); + update_mom((danti_hermitmat *)refMom, dir, u.data_array().data, (dsu3_matrix *)staple, (double)eb3, + lat); } else { - update_mom((fanti_hermitmat *)refMom, dir, (fsu3_matrix **)sitelink, (fsu3_matrix *)staple, (float)eb3, lat); + update_mom((fanti_hermitmat *)refMom, dir, u.data_array().data, (fsu3_matrix *)staple, (float)eb3, + lat); } } else { if (prec == QUDA_DOUBLE_PRECISION) { - update_gauge((dsu3_matrix *)refMom, dir, (dsu3_matrix **)sitelink, (dsu3_matrix *)staple, (double)eb3, lat); + update_gauge((dsu3_matrix *)refMom, dir, u.data_array().data, (dsu3_matrix *)staple, (double)eb3, + lat); } else { - update_gauge((fsu3_matrix *)refMom, dir, (fsu3_matrix **)sitelink, (fsu3_matrix *)staple, (float)eb3, lat); + update_gauge((fsu3_matrix *)refMom, dir, u.data_array().data, (fsu3_matrix *)staple, (float)eb3, + lat); } } host_free(staple); @@ -465,8 +469,6 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir, int *length, void *loop_coeff, int num_paths, bool compute_force) { - void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)}; - // created extended field quda::lat_dim_t R; for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d); @@ -475,13 +477,12 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ** param.gauge_order = QUDA_QDP_GAUGE_ORDER; param.t_boundary = QUDA_PERIODIC_T; - auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R); + auto qdp_ex = quda::createExtendedGauge(u.data_array().data, param, R); lattice_t lat(*qdp_ex); - void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)}; for (int dir = 0; dir < 4; dir++) { - gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, u.Precision(), path_dir[dir], length, loop_coeff, - num_paths, lat, compute_force); + gauge_force_reference_dir(refMom, dir, eb3, u, *qdp_ex, u.Precision(), path_dir[dir], length, loop_coeff, num_paths, + lat, compute_force); } delete qdp_ex; @@ -490,8 +491,6 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ** void gauge_loop_trace_reference(quda::GaugeField &u, std::vector &loop_traces, double factor, int **input_path, int *length, double *path_coeff, int num_paths) { - void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)}; - // create extended field quda::lat_dim_t R; for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d); @@ -499,20 +498,20 @@ void gauge_loop_trace_reference(quda::GaugeField &u, std::vector setGaugeParam(param); param.gauge_order = QUDA_QDP_GAUGE_ORDER; param.t_boundary = QUDA_PERIODIC_T; - - auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R); + auto qdp_ex = quda::createExtendedGauge(u.data_array().data, param, R); lattice_t lat(*qdp_ex); - void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)}; std::vector loop_tr_dbl(2 * num_paths); for (int i = 0; i < 
num_paths; i++) { if (u.Precision() == QUDA_DOUBLE_PRECISION) { - dcomplex tr = compute_loop_trace((dsu3_matrix **)sitelink_ex, input_path[i], length[i], path_coeff[i], lat); + dcomplex tr + = compute_loop_trace(qdp_ex->data_array().data, input_path[i], length[i], path_coeff[i], lat); loop_tr_dbl[2 * i] = factor * tr.real; loop_tr_dbl[2 * i + 1] = factor * tr.imag; } else { - dcomplex tr = compute_loop_trace((fsu3_matrix **)sitelink_ex, input_path[i], length[i], path_coeff[i], lat); + dcomplex tr + = compute_loop_trace(qdp_ex->data_array().data, input_path[i], length[i], path_coeff[i], lat); loop_tr_dbl[2 * i] = factor * tr.real; loop_tr_dbl[2 * i + 1] = factor * tr.imag; } diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 1bef7b74bb..5da6dc1a7a 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -117,9 +117,9 @@ void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, siz void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, QudaPrecision precision, size_t nhops) { if (precision == QUDA_SINGLE_PRECISION) { - computeLinkOrderedOuterProduct((fsu3_vector *)src, dst, nhops); + computeLinkOrderedOuterProduct(static_cast(src), dst, nhops); } else { - computeLinkOrderedOuterProduct((dsu3_vector *)src, dst, nhops); + computeLinkOrderedOuterProduct(static_cast(src), dst, nhops); } } @@ -342,7 +342,7 @@ template class LoadStore void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index, Matrix<3, std::complex> *const mat) const; - void loadMatrixFromField(const Real *const field, int oddBit, int dir, int half_lattice_index, + void loadMatrixFromField(const Real *const *const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex> *const mat) const; void storeMatrixToField(const Matrix<3, std::complex> &mat, int oddBit, int half_lattice_index, @@ -352,12 +352,12 @@ template class LoadStore Real *const) const; void addMatrixToField(const Matrix<3, std::complex> &mat, int oddBit, int dir, int half_lattice_index, - Real coeff, Real *const) const; + Real coeff, Real *const *const) const; void storeMatrixToMomentumField(const Matrix<3, std::complex> &mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real *const) const; - Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const; - void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const; + Real getData(const Real *const *const field, int idx, int dir, int oddBit, int offset, int hfv) const; + void addData(Real *const *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const; int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const; int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const; }; @@ -423,16 +423,16 @@ int LoadStore::half_idx_conversion_normal2ex(int half_lattice_index, const } template -Real LoadStore::getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const +Real LoadStore::getData(const Real *const *const field, int idx, int dir, int oddBit, int offset, int hfv) const { // QDP format - return ((Real **)field)[dir][(hfv * oddBit + idx) * 18 + offset]; + return field[dir][(hfv * oddBit + idx) * 18 + offset]; } template -void LoadStore::addData(Real *const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const +void 
LoadStore::addData(Real *const *const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const { // QDP format - ((Real **)field)[dir][(hfv * oddBit + idx) * 18 + offset] += v; + field[dir][(hfv * oddBit + idx) * 18 + offset] += v; } template @@ -455,7 +455,7 @@ void LoadStore::loadMatrixFromField(const Real *const field, int oddBit, i } template -void LoadStore::loadMatrixFromField(const Real *const field, int oddBit, int dir, int half_lattice_index, +void LoadStore::loadMatrixFromField(const Real *const *const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex> *const mat) const { #ifdef MULTI_GPU @@ -464,11 +464,10 @@ void LoadStore::loadMatrixFromField(const Real *const field, int oddBit, i int hfv = Vh; #endif - // const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18; int offset = 0; for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) { - (*mat)(i, j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv)); + (*mat)(i, j) = getData(field, half_lattice_index, dir, oddBit, offset++, hfv); (*mat)(i, j) += std::complex(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv)); } } @@ -515,23 +514,18 @@ void LoadStore::addMatrixToField(const Matrix<3, std::complex> &mat, template void LoadStore::addMatrixToField(const Matrix<3, std::complex> &mat, int oddBit, int dir, - int half_lattice_index, Real coeff, Real *const field) const + int half_lattice_index, Real coeff, Real *const *const field) const { - #ifdef MULTI_GPU int hfv = Vh_ex; #else int hfv = Vh; #endif - // Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18; int offset = 0; for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) { - // local_field[offset++] += coeff*mat(i,j).real(); addData(field, half_lattice_index, dir, oddBit, offset++, coeff * mat(i, j).real(), hfv); - - // local_field[offset++] += coeff*mat(i,j).imag(); addData(field, half_lattice_index, dir, oddBit, offset++, coeff * mat(i, j).imag(), hfv); } } @@ -762,7 +756,8 @@ void computeOneLinkSite( #else const int[], #endif - int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore &ls, Real *const output) + int half_lattice_index, const Real *const *const oprod, int sig, Real coeff, const LoadStore &ls, + Real *const *const output) { if (GOES_FORWARDS(sig)) { typename ColorMatrix::Type colorMatW; @@ -777,7 +772,7 @@ void computeOneLinkSite( } template -void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output) +void computeOneLinkField(const int dim[4], const Real *const *const oprod, int sig, Real coeff, Real *const *const output) { int volume = 1; for (int dir = 0; dir < 4; ++dir) volume *= dim[dir]; @@ -795,10 +790,10 @@ void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Rea // middleLinkKernel compiles for now, but lots of debugging to be done template void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code. 
- const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, + const int dim[4], void *const oprod, const Real *const Qprev, const Real *const *const link, int sig, int mu, Real coeff, const LoadStore &ls, // pass a function object to read from and write to matrix fields - Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod) + Real *const Pmu, Real *const P3, Real *const Qmu, Real *const *const newOprod) { const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false; const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false; @@ -842,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett if (Qprev == NULL) { if (sig_positive) { - ls.loadMatrixFromField(oprod, 1 - oddBit, sig, point_d, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); } else { - ls.loadMatrixFromField(oprod, oddBit, OPP_DIR(sig), point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); colorMatY = conj(colorMatY); } } else { // Qprev != NULL - ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); } colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY; @@ -880,9 +875,9 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett } // computeMiddleLinkSite template -void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, +void computeMiddleLinkField(const int dim[4], void *const oprod, const Real *const Qprev, const Real *const *const link, int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu, - Real *const newOprod) + Real *const *const newOprod) { int volume = 1; @@ -911,9 +906,9 @@ template void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code. const int dim[4], const Real *const P3, const Real *const Qprod, // why? - const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore &ls, // pass a function object to read from and write to matrix fields - Real *const shortP, Real *const newOprod) + Real *const shortP, Real *const *const newOprod) { const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false; @@ -979,8 +974,8 @@ void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better template void computeSideLinkField(const int dim[4], const Real *const P3, const Real *const Qprod, // why? - const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, - Real *const newOprod) + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, + Real *const shortP, Real *const *const newOprod) { // Need some way of setting half_volume int volume = 1; @@ -1005,10 +1000,10 @@ void computeSideLinkField(const int dim[4], const Real *const P3, template void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code. 
- const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, - int sig, int mu, Real coeff, Real accumu_coeff, + const int dim[4], const Real *const oprod, const Real *const Qprev, + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore &ls, // pass a function object to read from and write to matrix fields - Real *const shortP, Real *const newOprod) + Real *const shortP, Real *const *const newOprod) { const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false; @@ -1092,8 +1087,9 @@ void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better } // allLinkKernel template -void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, - int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod) +void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, + const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, + Real *const shortP, Real *const *const newOprod) { int volume = 1; for (int dir = 0; dir < 4; ++dir) volume *= dim[dir]; @@ -1132,8 +1128,8 @@ template struct PathCoefficients { }; template -void doHisqStaplesForceCPU(const int dim[4], PathCoefficients staple_coeff, Real *oprod, Real *link, - Real **tempmat, Real *newOprod) +void doHisqStaplesForceCPU(const int dim[4], PathCoefficients staple_coeff, Real **oprod, Real **link, + Real **tempmat, Real **newOprod) { Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff; @@ -1217,9 +1213,6 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda uint64_t len = 1; for (int dir = 0; dir < 4; ++dir) len *= X_[dir]; #endif - // allocate memory for temporary fields - void *tempmat[6]; - for (int i = 0; i < 6; i++) { tempmat[i] = safe_malloc(len * 18 * precision); } PathCoefficients act_path_coeff; act_path_coeff.one = path_coeff[0]; @@ -1229,27 +1222,29 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda act_path_coeff.seven = path_coeff[4]; act_path_coeff.lepage = path_coeff[5]; - void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; - void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; - void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)}; - if (precision == QUDA_DOUBLE_PRECISION) { - doHisqStaplesForceCPU(X_, act_path_coeff, reinterpret_cast(oprod_array), - reinterpret_cast(link_array), (double **)tempmat, - reinterpret_cast(noprod_array)); - } else if (precision == QUDA_SINGLE_PRECISION) { - doHisqStaplesForceCPU(X_, act_path_coeff, reinterpret_cast(oprod_array), - reinterpret_cast(link_array), (float **)tempmat, - reinterpret_cast(noprod_array)); + if (precision == QUDA_SINGLE_PRECISION) { + // allocate memory for temporary fields + float *tempmat[6]; + for (int i = 0; i < 6; i++) { tempmat[i] = static_cast(safe_malloc(len * 18 * precision)); } + doHisqStaplesForceCPU(X_, act_path_coeff, oprod.data_array().data, link.data_array().data, + tempmat, newOprod->data_array().data); + for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); } + } else if (precision == QUDA_DOUBLE_PRECISION) { + // allocate memory for temporary fields + double *tempmat[6]; + for (int i = 0; i < 6; i++) { tempmat[i] = static_cast(safe_malloc(len * 18 * precision)); } + doHisqStaplesForceCPU(X_, act_path_coeff, oprod.data_array().data, + link.data_array().data, tempmat, 
newOprod->data_array().data); + for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); } } else { errorQuda("Unsupported precision"); } - - for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); } } template -void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, - int sig, Real coeff, const LoadStore &ls, Real *const output) +void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const *const oprod, + const Real *const *const link, int sig, Real coeff, const LoadStore &ls, + Real *const *const output) { if (GOES_FORWARDS(sig)) { @@ -1296,8 +1291,8 @@ void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *c } template -void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, - Real *const output) +void computeLongLinkField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig, + Real coeff, Real *const *const output) { int volume = 1; for (int dir = 0; dir < 4; ++dir) volume *= dim[dir]; @@ -1321,16 +1316,13 @@ void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeFiel for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; QudaPrecision precision = oprod.Precision(); - void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; - void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; - void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)}; for (int sig = 0; sig < 4; ++sig) { if (precision == QUDA_SINGLE_PRECISION) { - computeLongLinkField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), - sig, coeff, reinterpret_cast(noprod_array)); + computeLongLinkField(X_, oprod.data_array().data, link.data_array().data, sig, coeff, + newOprod->data_array().data); } else if (precision == QUDA_DOUBLE_PRECISION) { - computeLongLinkField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), - sig, coeff, reinterpret_cast(noprod_array)); + computeLongLinkField(X_, oprod.data_array().data, link.data_array().data, sig, coeff, + newOprod->data_array().data); } else { errorQuda("Unrecognised precision"); } @@ -1344,8 +1336,8 @@ void completeForceSite(int half_lattice_index, #else const int[], #endif - const Real *const oprod, const Real *const link, int sig, const LoadStore &ls, - Real *const mom) + const Real *const *const oprod, const Real *const *const link, int sig, + const LoadStore &ls, Real *const mom) { typename ColorMatrix::Type colorMatX, colorMatY, linkW; @@ -1366,7 +1358,8 @@ void completeForceSite(int half_lattice_index, } template -void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom) +void completeForceField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig, + Real *const mom) { int volume = dim[0] * dim[1] * dim[2] * dim[3]; const int half_volume = volume / 2; @@ -1384,15 +1377,13 @@ void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda: for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; QudaPrecision precision = oprod.Precision(); - void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; - void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; for (int sig = 0; sig < 4; ++sig) { if (precision == QUDA_SINGLE_PRECISION) { - completeForceField(X_, 
reinterpret_cast(oprod_array), reinterpret_cast(link_array), sig, + completeForceField(X_, oprod.data_array().data, link.data_array().data, sig, + mom->data()); } else if (precision == QUDA_DOUBLE_PRECISION) { - completeForceField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), - sig, mom->data()); + completeForceField(X_, oprod.data_array().data, link.data_array().data, sig, + mom->data()); } else { errorQuda("Unrecognised precision"); } diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 212549dc38..eb7eb3d604 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -29,13 +29,19 @@ * @param[in] oddBit The odd/even bit for the site index * @param[in] daggerBit Perform the ordinary dslash (0) or Hermitian conjugate (1) * @param[in] dslash_type The type of Dslash operation + * @param[in] laplace3D Whether we are applying the 3-d Laplace operator + * (in the case of dslash_type being QUDA_LAPLACE_DSLASH) */ template void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const real_t *const *longlink, const real_t *const *ghostFatlink, const real_t *const *ghostLonglink, const real_t *spinorField, const real_t *const *fwd_nbr_spinor, - const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type) + const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, + QudaDslashType dslash_type, int laplace3D) { + if (laplace3D < 4 && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("laplace3D = %d only supported for Laplace dslash (%d requested)", laplace3D, dslash_type); + #pragma omp parallel for for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0; @@ -66,6 +72,7 @@ void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const r int offset = stag_spinor_site_size * sid; for (int dir = 0; dir < 8; dir++) { + if (laplace3D == dir / 2) continue; // skip dimensions if needed const int nFace = dslash_type == QUDA_ASQTAD_DSLASH ?
3 : 1; const real_t *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, ghostFatlinkEven, ghostFatlinkOdd, 1, 1); @@ -108,7 +115,7 @@ void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const r } void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { @@ -139,27 +146,26 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF long_link.Ghost()[3].data()}; if (in.Precision() == QUDA_DOUBLE_PRECISION) { - staggeredDslashReference(static_cast(out.data()), reinterpret_cast(qdp_fatlink), + staggeredDslashReference(out.data(), reinterpret_cast(qdp_fatlink), reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), - reinterpret_cast(ghost_longlink), static_cast(in.data()), + reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } else if (in.Precision() == QUDA_SINGLE_PRECISION) { - staggeredDslashReference(static_cast(out.data()), reinterpret_cast(qdp_fatlink), + staggeredDslashReference(out.data(), reinterpret_cast(qdp_fatlink), reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), - reinterpret_cast(ghost_longlink), static_cast(in.data()), + reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } } void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -169,11 +175,12 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not // have the minus sign. Inverting the expected dagger convention // solves this discrepancy. - stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type); - stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type); + stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type, laplace3D); + stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type, laplace3D); if (dslash_type == QUDA_LAPLACE_DSLASH) { - double kappa = 1.0 / (8 + mass); + int dimension = laplace3D < 4 ? 
3 : 4; + double kappa = 1.0 / (2 * dimension + mass); xpay(in.data(), kappa, out.data(), out.Length(), out.Precision()); } else { axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision()); @@ -181,12 +188,9 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel } void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -197,15 +201,16 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga quda::ColorSpinorField tmp(csParam); // Apply mat in sequence - stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type); - stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type); + stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type, laplace3D); + stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type, laplace3D); } void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type, + int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } + if (laplace3D < 4) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + checkPrecision(in, fat_link); // assert we have single-parity spinors if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET) @@ -225,8 +230,8 @@ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFi quda::ColorSpinorField tmp(csParam); // dagger bit does not matter - stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type); - stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type); + stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type, laplace3D); + stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type, laplace3D); double msq_x4 = mass * mass * 4; if (in.Precision() == QUDA_DOUBLE_PRECISION) { diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index 34625abfa3..557be21bd0 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -21,9 +21,13 @@ void setDims(int *); * @param[in] oddBit 0 for D_eo, 1 for D_oe * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled).
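To make the convention documented above concrete: `laplace3D == 4` selects the regular 4-d operator, while a value < 4 drops that dimension from the stencil, which is also what ties `kappa` to the number of active dimensions in `stag_mat` above. A hypothetical helper capturing just that normalisation (illustration only, not part of the patch):

```cpp
#include <cassert>

// kappa normalisation for the Laplace operator as used in stag_mat above:
// a d-dimensional Laplace stencil has 2*d neighbour terms, so kappa = 1 / (2*d + mass).
double laplace_kappa(int laplace3D, double mass)
{
  assert(laplace3D >= 0 && laplace3D <= 4);
  const int dimension = laplace3D < 4 ? 3 : 4; // a value < 4 names the dimension that is skipped
  return 1.0 / (2 * dimension + mass);
}
```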
*/ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the full parity staggered-type dslash * @@ -35,9 +39,13 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the full parity staggered-type matdag_mat * @@ -49,9 +57,13 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the even-even or odd-odd preconditioned staggered dslash * @@ -64,6 +76,11 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga * @param[in] dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator * @param[in] parity Parity of preconditioned dslash * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled).
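For context on the preconditioned operator these declarations describe: `stag_matpc` applies the dslash to the opposite parity and back, then forms 4m² in − D D in (the `msq_x4` term in the definition above). A minimal sketch of that composition, with the two dslash applications supplied as callables since this is a sketch rather than the QUDA API:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

using Vec = std::vector<double>;

// out = (4 m^2 - D_{p,op} D_{op,p}) in on a single-parity vector, the operator
// verified by stag_matpc above (sign conventions elided).
Vec matpc_sketch(double mass, const Vec &in, const std::function<Vec(const Vec &)> &dslash_to_other_parity,
                 const std::function<Vec(const Vec &)> &dslash_back)
{
  Vec tmp = dslash_to_other_parity(in); // D_{op,p} in
  Vec out = dslash_back(tmp);           // D_{p,op} D_{op,p} in
  const double msq_x4 = 4.0 * mass * mass;
  for (std::size_t k = 0; k < in.size(); k++) out[k] = msq_x4 * in[k] - out[k];
  return out;
}
```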
*/ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type, + int laplace3D); diff --git a/tests/io_test.cpp b/tests/io_test.cpp index 2618b0e479..25296b02b9 100644 --- a/tests/io_test.cpp +++ b/tests/io_test.cpp @@ -31,6 +31,9 @@ TEST_P(GaugeIOTest, verify) gauge_param.cpu_prec = ::testing::get<0>(param); if (!quda::is_enabled(gauge_param.cpu_prec)) GTEST_SKIP(); gauge_param.cuda_prec = gauge_param.cpu_prec; + gauge_param.cuda_prec_sloppy = gauge_param.cpu_prec; + gauge_param.cuda_prec_precondition = gauge_param.cpu_prec; + gauge_param.cuda_prec_eigensolver = gauge_param.cpu_prec; gauge_param.t_boundary = QUDA_PERIODIC_T; // Allocate host side memory for the gauge field. diff --git a/tests/laph_test.cpp b/tests/laph_test.cpp index ac20bce407..75f244d406 100644 --- a/tests/laph_test.cpp +++ b/tests/laph_test.cpp @@ -114,8 +114,8 @@ auto laph_test(test_t param) std::vector qudaRes(nSink * nEv * Lt * nSpin, 0.); int X[4] = {xdim, ydim, zdim, tdim}; - laphSinkProject((__complex__ double *)qudaRes.data(), (void **)snkPtr.data(), nSink, tileSink, - (void **)evPtr.data(), nEv, tileEv, &invParam, X); + laphSinkProject((__complex__ double *)qudaRes.data(), snkPtr.data(), nSink, tileSink, evPtr.data(), nEv, tileEv, + &invParam, X); printfQuda("laphSinkProject Done: %g secs, %g Gflops\n", invParam.secs, invParam.gflops / invParam.secs); auto tol = getTolerance(cuda_prec); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index a9b499d71a..1b5609bfda 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -85,14 +85,16 @@ struct StaggeredDslashTestWrapper { for (int i = 0; i < Nsrc; i++) { switch (dtest_type) { case dslash_test_type::Dslash: - stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type); + stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type, laplace3D); break; case dslash_test_type::MatPC: - stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type); + stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type, laplace3D); + break; + case dslash_test_type::Mat: + stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; - case dslash_test_type::Mat: stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); break; case dslash_test_type::MatDagMat: - stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); + stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; default: errorQuda("Test type %d not defined", static_cast(dtest_type)); } @@ -406,12 +408,12 @@ struct StaggeredDslashTestWrapper { size_t ghost_bytes = cudaSpinor[0].GhostBytes(); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - 
::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); printfQuda( diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 93ae358896..95ebff044b 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -75,12 +75,6 @@ void init() gauge_param = newQudaGaugeParam(); setStaggeredGaugeParam(gauge_param); - QudaGaugeSmearParam smear_param; - if (gauge_smear) { - smear_param = newQudaGaugeSmearParam(); - setGaugeSmearParam(smear_param); - } - // Though no inversions are performed, the inv_param // structure contains all the information we need to // construct the dirac operator. @@ -166,10 +160,20 @@ std::vector eigensolve(test_t test_param) eig_param.compute_svd = ::testing::get<3>(test_param); eig_param.spectrum = ::testing::get<4>(test_param); - if (eig_param.use_pc) - eig_inv_param.solution_type = QUDA_MATPC_SOLUTION; - else - eig_inv_param.solution_type = QUDA_MAT_SOLUTION; + eig_inv_param.solution_type = eig_param.use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; + + // whether we are using the resident smeared gauge or not + eig_param.use_smeared_gauge = gauge_smear; + + if (dslash_type == QUDA_LAPLACE_DSLASH) { + int dimension = laplace3D < 4 ? 3 : 4; + eig_inv_param.kappa = 1.0 / (2 * dimension + mass); + eig_inv_param.laplace3D = laplace3D; + if (dimension == 3) { + eig_param.ortho_dim = laplace3D; + eig_param.ortho_dim_size_local = tdim; + } + } // For gtest testing, we prohibit the use of polynomial acceleration as // the fine tuning required can inhibit convergence of an otherwise @@ -192,24 +196,65 @@ std::vector eigensolve(test_t test_param) if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE)) display_test_info(eig_param); + // Gauge Smearing Routines + if (gauge_smear) { + quda::host_timer_t host_timer; + host_timer.start(); // start the timer + + std::vector obs_param(gauge_smear_steps / measurement_interval + 1); + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i] = newQudaGaugeObservableParam(); + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE; + obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + } + + // We here set all the problem parameters for all possible smearing types. + QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); + setGaugeSmearParam(smear_param); + + switch (smear_param.smear_type) { + case QUDA_GAUGE_SMEAR_APE: + case QUDA_GAUGE_SMEAR_STOUT: + case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: + case QUDA_GAUGE_SMEAR_HYP: { + performGaugeSmearQuda(&smear_param, obs_param.data()); + break; + } + + // Here we use a typical use case which is different from simple smearing in that + // the user will want to compute the plaquette values to compute the gauge energy. 
+ case QUDA_GAUGE_SMEAR_WILSON_FLOW: + case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: { + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + } + performWFlowQuda(&smear_param, obs_param.data()); + break; + } + default: errorQuda("Undefined gauge smear type %d given", smear_param.smear_type); + } + + host_timer.stop(); + printfQuda("Time for gauge smearing = %f\n", host_timer.last()); + } + // Vector construct START //---------------------------------------------------------------------------- // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector evecs(n_eig); quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. std::vector host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues - std::vector<__complex__ double> evals(eig_n_conv); + int n_batch = laplace3D == 3 ? eig_param.ortho_dim_size_local * comm_dim(3) : 1; + std::vector<__complex__ double> evals(eig_n_conv * n_batch); // Vector construct END //---------------------------------------------------------------------------- @@ -226,19 +271,20 @@ std::vector eigensolve(test_t test_param) printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last()); // Perform host side verification of eigenvector if requested. - // ... std::vector residua(eig_n_conv, 0.0); // Perform host side verification of eigenvector if requested. 
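Note the eigenvalue layout the 3-d path introduces here: `evals` holds `eig_n_conv * n_batch` entries in batch-major order, so the eigenvalue of vector `i` on timeslice `b` lives at `evals[b * eig_n_conv + i]`. A small sketch of the gather performed by the verification loop below (using `std::complex<double>` in place of `__complex__ double`):

```cpp
#include <complex>
#include <vector>

// Collect the per-timeslice eigenvalues of eigenvector i from the batch-major
// layout used above: the entry for slice b is evals[b * n_conv + i].
std::vector<std::complex<double>> gather_evals(const std::vector<std::complex<double>> &evals, int n_conv,
                                               int n_batch, int i)
{
  std::vector<std::complex<double>> lambda(n_batch);
  for (int b = 0; b < n_batch; b++) lambda[b] = evals[b * n_conv + i];
  return lambda;
}
```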
if (verify_results) { for (int i = 0; i < eig_n_conv; i++) { if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) { - double _Complex sigma = evals[i]; + std::vector<double _Complex> sigma(n_batch); + for (auto b = 0; b < n_batch; b++) sigma[b] = evals[b * eig_n_conv + i]; residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, - cpuLongQDP); + cpuLongQDP, laplace3D); } else { - double _Complex lambda = evals[i]; - residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP); + std::vector<double _Complex> lambda(n_batch); + for (auto b = 0; b < n_batch; b++) lambda[b] = evals[b * eig_n_conv + i]; + residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP, laplace3D); } } } @@ -285,7 +331,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp index c73c85378e..8918c3df4d 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -32,6 +32,9 @@ bool skip_test(test_t test_param) auto compute_svd = ::testing::get<3>(test_param); auto spectrum = ::testing::get<4>(test_param); + // 3-d operator only supported for Laplace + if (eig_type == QUDA_EIG_TR_LANCZOS_3D && dslash_type != QUDA_LAPLACE_DSLASH) return true; + // Reverse engineer the operator type QudaSolveType combo_solve_type = get_solve_type(use_norm_op, use_pc, compute_svd); if (combo_solve_type == QUDA_DIRECT_PC_SOLVE) { @@ -87,6 +90,7 @@ bool skip_test(test_t test_param) case QUDA_LAPLACE_DSLASH: switch (eig_type) { case QUDA_EIG_TR_LANCZOS: + case QUDA_EIG_TR_LANCZOS_3D: case QUDA_EIG_BLK_TR_LANCZOS: if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; break; @@ -117,6 +121,8 @@ TEST_P(StaggeredEigensolveTest, verify) auto eig_type = ::testing::get<0>(GetParam()); if (eig_type == QUDA_EIG_IR_ARNOLDI || eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; auto tol = factor * eig_param.tol; + if (dslash_type == QUDA_LAPLACE_DSLASH) laplace3D = eig_type == QUDA_EIG_TR_LANCZOS_3D ?
3 : 4; + for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol); } @@ -143,6 +149,9 @@ auto hermitian_solvers = Values(QUDA_EIG_TR_LANCZOS, QUDA_EIG_BLK_TR_LANCZOS, QU // Can solve non-hermitian systems auto non_hermitian_solvers = Values(QUDA_EIG_IR_ARNOLDI); +// Batched solvers for 3-d operators +auto batched_solvers = Values(QUDA_EIG_TR_LANCZOS_3D); + // Eigensolver spectrum types auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG); auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG, @@ -171,3 +180,9 @@ INSTANTIATE_TEST_SUITE_P(DirectFull, StaggeredEigensolveTest, ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), gettestname); + +// 3-d full system direct solve +INSTANTIATE_TEST_SUITE_P(DirectFull3D, StaggeredEigensolveTest, + ::testing::Combine(batched_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_FALSE), hermitian_spectrum), + gettestname); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index a098324ef6..77d9cd5a75 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -450,9 +450,9 @@ std::vector> solve(test_t param) // Create an appropriate subset of the full out_multishift vector std::vector out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; - res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); + res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param, laplace3D); } else { - res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, n); + res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, laplace3D, n); } } } @@ -510,7 +510,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 7ef2c270b3..1b659eec59 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -53,6 +53,10 @@ bool skip_test(test_t param) // MR struggles with the staggered and asqtad spectrum, it's not MR's fault if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + + // BiCGStab-L is unstable with staggered operator with low precision + // we could likely restore this when 20-bit quark fields are merged in + if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true; } // CG3 is rather unstable with low precision diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp index 265fed1248..5ddfc52956 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -18,19 +18,6 @@ #define MAX(a, b) ((a) > (b) ? 
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index 7ef2c270b3..1b659eec59 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -53,6 +53,10 @@ bool skip_test(test_t param)
     // MR struggles with the staggered and asqtad spectrum, it's not MR's fault
     if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER)
       return true;
+
+    // BiCGStab-L is unstable with staggered operator with low precision
+    // we could likely restore this when 20-bit quark fields are merged in
+    if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true;
   }
 
   // CG3 is rather unstable with low precision
diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp
index 265fed1248..5ddfc52956 100644
--- a/tests/su3_test.cpp
+++ b/tests/su3_test.cpp
@@ -18,19 +18,6 @@
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-// Smearing variables
-double gauge_smear_rho = 0.1;
-double gauge_smear_epsilon = 0.1;
-double gauge_smear_alpha = 0.6;
-double gauge_smear_alpha1 = 0.75;
-double gauge_smear_alpha2 = 0.6;
-double gauge_smear_alpha3 = 0.3;
-int gauge_smear_steps = 50;
-QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
-int gauge_smear_dir_ignore = -1;
-int measurement_interval = 5;
-bool su_project = true;
-
 void display_test_info()
 {
   printfQuda("running the following test:\n");
@@ -68,49 +55,6 @@ void display_test_info()
   return;
 }
 
-void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
-{
-  CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE},
-                                                                {"stout", QUDA_GAUGE_SMEAR_STOUT},
-                                                                {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT},
-                                                                {"hyp", QUDA_GAUGE_SMEAR_HYP},
-                                                                {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW},
-                                                                {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}};
-
-  // Option group for SU(3) related options
-  auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests");
-
-  opgroup
-    ->add_option(
-      "--su3-smear-type",
-      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)")
-    ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
-  ;
-  opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)");
-
-  opgroup->add_option("--su3-smear-rho", gauge_smear_rho,
-                      "rho coefficient for Stout and Over-Improved Stout smearing (default 0.1)");
-
-  opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon,
-                      "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)");
-
-  opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)");
-  opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
-  opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
-
-  opgroup->add_option(
-    "--su3-smear-dir-ignore", gauge_smear_dir_ignore,
-    "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)");
-
-  opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)");
-
-  opgroup->add_option("--su3-measurement-interval", measurement_interval,
-                      "Measure the field energy and/or topological charge every Nth step (default 5) ");
-
-  opgroup->add_option("--su3-project", su_project,
-                      "Project smeared gauge onto su3 manifold at measurement interval (default true)");
-}
-
 int main(int argc, char **argv)
 {
@@ -257,7 +201,7 @@ int main(int argc, char **argv)
   QudaGaugeObservableParam *obs_param = new QudaGaugeObservableParam[gauge_smear_steps / measurement_interval + 1];
   for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) {
     obs_param[i] = newQudaGaugeObservableParam();
-    obs_param[i].compute_plaquette = QUDA_BOOLEAN_FALSE;
+    obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE;
     obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE;
     obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
   }
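With the plaquette measurement re-enabled above, each measurement interval records both the plaquette and the topological charge during smearing. A rough sketch of how such an observable array pairs with a smearing call, assuming the `newQudaGaugeSmearParam`/`performGaugeSmearQuda` entry points from quda.h and that `initQuda`/`loadGaugeQuda` have already been called (all parameter values are illustrative only):

```cpp
#include <vector>
#include <quda.h>

// Sketch: smear the resident gauge field, measuring observables as we go.
void smear_and_measure(int n_steps, int meas_interval)
{
  QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam();
  smear_param.smear_type = QUDA_GAUGE_SMEAR_STOUT;
  smear_param.n_steps = n_steps;
  smear_param.meas_interval = meas_interval;
  smear_param.rho = 0.1;
  smear_param.struct_size = sizeof(smear_param);

  // One observable struct per measurement, plus one for the unsmeared field
  int n_meas = n_steps / meas_interval + 1;
  std::vector<QudaGaugeObservableParam> obs_param(n_meas);
  for (int i = 0; i < n_meas; i++) {
    obs_param[i] = newQudaGaugeObservableParam();
    obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; // now measured, per this change
    obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE;
  }

  performGaugeSmearQuda(&smear_param, obs_param.data());
}
```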
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index 467b688768..0c15b5ea56 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -73,7 +73,6 @@ quda::mgarray<bool> mg_vec_partfile = {};
 QudaInverterType inv_type;
 bool inv_deflate = false;
 bool inv_multigrid = false;
-bool gauge_smear = false;
 QudaInverterType precon_type = QUDA_INVALID_INVERTER;
 QudaSchwarzType precon_schwarz_type = QUDA_INVALID_SCHWARZ;
 QudaAcceleratorType precon_accelerator_type = QUDA_INVALID_ACCELERATOR;
@@ -303,13 +302,18 @@ double eofa_mq2 = 0.85;
 double eofa_mq3 = 1.0;
 
 // SU(3) smearing options
+bool gauge_smear = false;
+QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
 double gauge_smear_rho = 0.1;
-double gauge_smear_epsilon = 1.0;
+double gauge_smear_epsilon = 0.1;
 double gauge_smear_alpha = 0.6;
+double gauge_smear_alpha1 = 0.75;
+double gauge_smear_alpha2 = 0.6;
+double gauge_smear_alpha3 = 0.3;
 int gauge_smear_steps = 5;
-QudaWFlowType wflow_type = QUDA_WFLOW_TYPE_WILSON;
+int gauge_smear_dir_ignore = -1;
 int measurement_interval = 5;
-QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
+bool su_project = true;
 
 // contract options
 QudaContractType contract_type = QUDA_CONTRACT_TYPE_STAGGERED_FT_T;
@@ -411,6 +415,7 @@ namespace
   CLI::TransformPairs<QudaEigType> eig_type_map {{"trlm", QUDA_EIG_TR_LANCZOS},
                                                  {"blktrlm", QUDA_EIG_BLK_TR_LANCZOS},
+                                                 {"trlm-3d", QUDA_EIG_TR_LANCZOS_3D},
                                                  {"iram", QUDA_EIG_IR_ARNOLDI},
                                                  {"blkiram", QUDA_EIG_BLK_IR_ARNOLDI}};
@@ -454,11 +459,12 @@ namespace
     {"SR", QUDA_SPECTRUM_SR_EIG}, {"LR", QUDA_SPECTRUM_LR_EIG}, {"SM", QUDA_SPECTRUM_SM_EIG},
     {"LM", QUDA_SPECTRUM_LM_EIG}, {"SI", QUDA_SPECTRUM_SI_EIG}, {"LI", QUDA_SPECTRUM_LI_EIG}};
 
-  CLI::TransformPairs<QudaWFlowType> wflow_type_map {{"wilson", QUDA_WFLOW_TYPE_WILSON},
-                                                     {"symanzik", QUDA_WFLOW_TYPE_SYMANZIK}};
-
-  CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {
-    {"ape", QUDA_GAUGE_SMEAR_APE}, {"stout", QUDA_GAUGE_SMEAR_STOUT}, {"ovr-imp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}};
+  CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE},
+                                                                {"stout", QUDA_GAUGE_SMEAR_STOUT},
+                                                                {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT},
+                                                                {"hyp", QUDA_GAUGE_SMEAR_HYP},
+                                                                {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW},
+                                                                {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}};
 
   CLI::TransformPairs<QudaSetupType> setup_type_map {{"test", QUDA_TEST_VECTOR_SETUP}, {"null", QUDA_TEST_VECTOR_SETUP}};
@@ -766,7 +772,6 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
 void add_eigen_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
-  CLI::QUDACheckedTransformer prec_transform(precision_map);
   // Option group for Eigensolver related options
   auto opgroup = quda_app->add_option_group("Eigensolver", "Options controlling eigensolver");
@@ -1121,9 +1126,16 @@ void add_eofa_option_group(std::shared_ptr<QUDAApp> quda_app)
 
 void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
-  // Option group for SU(3) related options
   auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests");
+
+  opgroup
+    ->add_option(
+      "--su3-smear-type",
+      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)")
+    ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
+  ;
+  opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)");
 
   opgroup->add_option("--su3-smear-rho", gauge_smear_rho,
@@ -1131,18 +1143,23 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
                       "rho coefficient for Stout and Over-Improved Stout smearing (default 0.1)");
 
   opgroup->add_option(
     "--su3-smear-epsilon", gauge_smear_epsilon,
-    "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 1.0)");
+    "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 0.1)");
 
-  opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 10)");
+  opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)");
+  opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
+  opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
 
-  opgroup->add_option("--su3-wflow-type", wflow_type, "The type of action to use in the wilson flow (default wilson)")
-    ->transform(CLI::QUDACheckedTransformer(wflow_type_map));
+  opgroup->add_option(
+    "--su3-smear-dir-ignore", gauge_smear_dir_ignore,
+    "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)");
 
-  opgroup->add_option("--su3-smear-type", gauge_smear_type, "The type of smearing to use (default stout)")
-    ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
+  opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 10)");
 
   opgroup->add_option("--su3-measurement-interval", measurement_interval,
-                      "Measure the field energy and topological charge every Nth step (default 5) ");
+                      "Measure the field energy and/or topological charge every Nth step (default 5) ");
+
+  opgroup->add_option("--su3-project", su_project,
+                      "Project smeared gauge onto su3 manifold at measurement interval (default true)");
 }
 
 void add_madwf_option_group(std::shared_ptr<QUDAApp> quda_app)
diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h
index aec9e84c7e..67ae7b7372 100644
--- a/tests/utils/command_line_params.h
+++ b/tests/utils/command_line_params.h
@@ -555,10 +555,14 @@ extern double eofa_mq3;
 extern double gauge_smear_rho;
 extern double gauge_smear_epsilon;
 extern double gauge_smear_alpha;
+extern double gauge_smear_alpha1;
+extern double gauge_smear_alpha2;
+extern double gauge_smear_alpha3;
 extern int gauge_smear_steps;
-extern QudaWFlowType wflow_type;
+extern int gauge_smear_dir_ignore;
 extern int measurement_interval;
 extern QudaGaugeSmearType gauge_smear_type;
+extern bool su_project;
 
 extern double smear_coeff;
 extern int smear_n_steps;
diff --git a/tests/utils/host_blas.cpp b/tests/utils/host_blas.cpp
index c9d8c63a59..e918f77589 100644
--- a/tests/utils/host_blas.cpp
+++ b/tests/utils/host_blas.cpp
@@ -63,20 +63,20 @@ void mxpy(void *x, void *y, int len, QudaPrecision precision)
 }
 
 // returns the square of the L2 norm of the vector
-template <typename real_t> inline double norm2(real_t *v, int len)
+template <typename real_t> inline double norm2(real_t *v, int len, bool global)
 {
   double sum = 0.0;
   for (int i = 0; i < len; i++) sum += v[i] * v[i];
-  quda::comm_allreduce_sum(sum);
+  if (global) quda::comm_allreduce_sum(sum);
   return sum;
 }
 
-double norm_2(void *v, int len, QudaPrecision precision)
+double norm_2(void *v, int len, QudaPrecision precision, bool global)
 {
   if (precision == QUDA_DOUBLE_PRECISION)
-    return norm2((double *)v, len);
+    return norm2((double *)v, len, global);
   else
-    return norm2((float *)v, len);
+    return norm2((float *)v, len, global);
 }
 
 // performs the operation y[i] = x[i] + a*y[i]
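`norm_2` now takes a `global` flag, declared with a default of `true` in host_utils.h below, so existing call sites keep the communicator-wide reduction while new callers can request a per-rank result. A usage sketch, assuming the test-utils environment (comms initialized, headers on the include path):

```cpp
#include <host_utils.h> // test-utils header declaring norm_2 and printfQuda

// Sketch: squared L2 norms of a host vector, with and without the allreduce
void norm_demo(double *v, int local_len)
{
  double global_sq = norm_2(v, local_len, QUDA_DOUBLE_PRECISION);        // summed over all ranks
  double local_sq = norm_2(v, local_len, QUDA_DOUBLE_PRECISION, false);  // this rank only
  printfQuda("norm2: local %e, global %e\n", local_sq, global_sq);
}
```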
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index c63d124dc8..473a527ac5 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -250,7 +250,7 @@ void exchange_llfat_init(QudaPrecision prec);
 void exchange_llfat_cleanup(void);
 
 // Implemented in host_blas.cpp
-double norm_2(void *vector, int len, QudaPrecision precision);
+double norm_2(void *vector, int len, QudaPrecision precision, bool global = true);
 void mxpy(void *x, void *y, int len, QudaPrecision precision);
 void ax(double a, void *x, int len, QudaPrecision precision);
 void cax(double _Complex a, void *x, int len, QudaPrecision precision);
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index 8ec29c514b..58cc3fb758 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -221,6 +221,7 @@ const char *get_eig_type_str(QudaEigType type)
   switch (type) {
   case QUDA_EIG_TR_LANCZOS: ret = "trlm"; break;
   case QUDA_EIG_BLK_TR_LANCZOS: ret = "blktrlm"; break;
+  case QUDA_EIG_TR_LANCZOS_3D: ret = "trlm_3d"; break;
   case QUDA_EIG_IR_ARNOLDI: ret = "iram"; break;
   case QUDA_EIG_BLK_IR_ARNOLDI: ret = "blkiram"; break;
   default: ret = "unknown eigensolver"; break;
diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp
index c6aa1218e9..5255b18315 100644
--- a/tests/utils/set_params.cpp
+++ b/tests/utils/set_params.cpp
@@ -5,12 +5,16 @@
 void setGaugeSmearParam(QudaGaugeSmearParam &smear_param)
 {
+  smear_param.smear_type = gauge_smear_type;
   smear_param.alpha = gauge_smear_alpha;
   smear_param.rho = gauge_smear_rho;
   smear_param.epsilon = gauge_smear_epsilon;
   smear_param.n_steps = gauge_smear_steps;
   smear_param.meas_interval = measurement_interval;
-  smear_param.smear_type = gauge_smear_type;
+  smear_param.alpha1 = gauge_smear_alpha1;
+  smear_param.alpha2 = gauge_smear_alpha2;
+  smear_param.alpha3 = gauge_smear_alpha3;
+  smear_param.dir_ignore = gauge_smear_dir_ignore;
   smear_param.struct_size = sizeof(smear_param);
 }
@@ -125,16 +129,17 @@ void setInvertParam(QudaInvertParam &inv_param)
 {
   // Set dslash type
   inv_param.dslash_type = dslash_type;
+  int dimension = laplace3D < 4 ? 3 : 4;
 
   // Use kappa or mass normalisation
   if (kappa == -1.0) {
     inv_param.mass = mass;
     inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass));
-    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (8 + mass);
+    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (2 * dimension + mass);
   } else {
     inv_param.kappa = kappa;
     inv_param.mass = 0.5 / kappa - (1.0 + 3.0 / anisotropy);
-    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 8.0;
+    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 2 * dimension;
   }
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Kappa = %.8f Mass = %.8f\n", inv_param.kappa, inv_param.mass);
@@ -936,7 +941,8 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
   // Solver params
   inv_param.verbosity = verbosity;
   inv_param.mass = mass;
-  inv_param.kappa = kappa = 1.0 / (8.0 + mass); // for Laplace operator
+  int dimension = laplace3D < 4 ? 3 : 4;
+  inv_param.kappa = 1.0 / (2 * dimension + mass); // for Laplace operator
   inv_param.laplace3D = laplace3D; // for Laplace operator
 
   if (Nsrc < Nsrc_tile || Nsrc % Nsrc_tile != 0)
@@ -1003,7 +1009,7 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
   inv_param.solve_type = solve_type;
   inv_param.matpc_type = matpc_type;
   inv_param.dagger = QUDA_DAG_NO;
-  inv_param.mass_normalization = QUDA_MASS_NORMALIZATION;
+  inv_param.mass_normalization = dslash_type == QUDA_LAPLACE_DSLASH ? QUDA_KAPPA_NORMALIZATION : QUDA_MASS_NORMALIZATION;
 
   inv_param.cpu_prec = cpu_prec;
   inv_param.cuda_prec = prec;
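The Laplace normalization above follows from the free-field diagonal of the operator: a d-dimensional Laplace stencil contributes 2d plus the mass term, so kappa = 1 / (2 * dimension + mass), with dimension = 3 when laplace3D selects an ignored direction and 4 otherwise. A small standalone check of the forward and inverse relations used by the two hunks above (the mass value is hypothetical):

```cpp
#include <cstdio>

int main()
{
  const double mass = 0.1;
  for (int laplace3D : {3, 4}) {
    int dimension = laplace3D < 4 ? 3 : 4;
    double kappa = 1.0 / (2 * dimension + mass);    // as in setStaggeredInvertParam
    double mass_back = 1.0 / kappa - 2 * dimension; // inverse relation from setInvertParam
    printf("d = %d: kappa = %f, recovered mass = %f\n", dimension, kappa, mass_back);
  }
  return 0;
}
```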