Skip to content

Commit

Permalink
Merge pull request #1530 from lattice/hotfix/pool
Browse files Browse the repository at this point in the history
Hotfix/pool
  • Loading branch information
weinbe2 authored Jan 13, 2025
2 parents b58f1ec + ab7b1ae commit 9765847
Show file tree
Hide file tree
Showing 13 changed files with 48 additions and 41 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,9 @@ endif()

find_package(Threads REQUIRED)
if(QUDA_OPENMP)
# NVHPC's host compiler (nvc++) is known not to work with QUDA's OpenMP
# build, so fail early with a clear diagnostic rather than at compile time.
if(${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC")
message(FATAL_ERROR "Host compiler (nvc++) not supported with QUDA_OPENMP=ON")
endif()
find_package(OpenMP REQUIRED)
endif()

Expand Down
7 changes: 2 additions & 5 deletions include/dirac_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -1911,14 +1911,12 @@ namespace quda {
const bool gpu_setup; /** Where to do the coarse-operator construction*/
mutable bool init_gpu; /** Whether this instance did the GPU allocation or not */
mutable bool init_cpu; /** Whether this instance did the CPU allocation or not */
const bool mapped; /** Whether we allocate Y and X GPU fields in mapped memory or not */

/**
@brief Allocate the Y and X fields
@param[in] gpu Whether to allocate on gpu (true) or cpu (false)
@param[in] mapped whether to put gpu allocations into mapped memory
*/
void createY(bool gpu = true, bool mapped = false) const;
void createY(bool gpu = true) const;

/**
@brief Allocate the Yhat and Xinv fields
Expand All @@ -1935,9 +1933,8 @@ namespace quda {
/**
@param[in] param Parameters defining this operator
@param[in] gpu_setup Whether to do the setup on GPU or CPU
@param[in] mapped Set to true to put Y and X fields in mapped memory
*/
DiracCoarse(const DiracParam &param, bool gpu_setup=true, bool mapped=false);
DiracCoarse(const DiracParam &param, bool gpu_setup=true);

/**
@param[in] param Parameters defining this operator
Expand Down
14 changes: 9 additions & 5 deletions include/quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -776,11 +776,6 @@ extern "C" {
/** Whether to use eigenvectors for the nullspace or, if the coarsest instance deflate*/
QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL];

/** Minimize device memory allocations during the adaptive setup,
placing temporary fields in mapped memory instead of device
memory */
QudaBoolean setup_minimize_memory;

/** Whether to compute the null vectors or reload them */
QudaComputeNullVector compute_null_vector;

Expand Down Expand Up @@ -1814,6 +1809,15 @@ extern "C" {
*/
void destroyDeflationQuda(void *df_instance);

/**
* @brief Flush the memory pools associated with the supplied type.
* At present this only supports the options QUDA_MEMORY_DEVICE and
* QUDA_MEMORY_HOST_PINNED, and any other type will result in an
* error.
* @param[in] type The memory type whose pool we wish to flush.
*/
void flushPoolQuda(QudaMemoryType type);

void setMPICommHandleQuda(void *mycomm);

// Parameter set for quark smearing operations
Expand Down
6 changes: 0 additions & 6 deletions lib/check_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -1024,12 +1024,6 @@ void printQudaMultigridParam(QudaMultigridParam *param) {
#endif
}

#ifdef INIT_PARAM
P(setup_minimize_memory, QUDA_BOOLEAN_FALSE);
#else
P(setup_minimize_memory, QUDA_BOOLEAN_INVALID);
#endif

P(compute_null_vector, QUDA_COMPUTE_NULL_VECTOR_INVALID);
P(generate_all_levels, QUDA_BOOLEAN_INVALID);

Expand Down
18 changes: 7 additions & 11 deletions lib/dirac_coarse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

namespace quda {

DiracCoarse::DiracCoarse(const DiracParam &param, bool gpu_setup, bool mapped) :
DiracCoarse::DiracCoarse(const DiracParam &param, bool gpu_setup) :
Dirac(param),
mass(param.mass),
mu(param.mu),
Expand All @@ -23,8 +23,7 @@ namespace quda {
enable_cpu(false),
gpu_setup(gpu_setup),
init_gpu(gpu_setup),
init_cpu(!gpu_setup),
mapped(mapped)
init_cpu(!gpu_setup)
{
initializeCoarse();
}
Expand Down Expand Up @@ -59,8 +58,7 @@ namespace quda {
enable_cpu(Y_h ? true : false),
gpu_setup(true),
init_gpu(enable_gpu ? false : true),
init_cpu(enable_cpu ? false : true),
mapped(Y_d->MemType() == QUDA_MEMORY_MAPPED)
init_cpu(enable_cpu ? false : true)
{

constexpr QudaGaugeFieldOrder gOrder = QUDA_MILC_GAUGE_ORDER;
Expand Down Expand Up @@ -116,12 +114,11 @@ namespace quda {
enable_cpu(dirac.enable_cpu),
gpu_setup(dirac.gpu_setup),
init_gpu(enable_gpu ? false : true),
init_cpu(enable_cpu ? false : true),
mapped(dirac.mapped)
init_cpu(enable_cpu ? false : true)
{
}

void DiracCoarse::createY(bool gpu, bool mapped) const
void DiracCoarse::createY(bool gpu) const
{
int ndim = transfer->Vectors().Ndim();
lat_dim_t x;
Expand All @@ -146,7 +143,6 @@ namespace quda {
gParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
gParam.nFace = 1;
gParam.geometry = QUDA_COARSE_GEOMETRY;
if (mapped) gParam.mem_type = QUDA_MEMORY_MAPPED;

int pad = std::max( { (x[0]*x[1]*x[2])/2, (x[1]*x[2]*x[3])/2, (x[0]*x[2]*x[3])/2, (x[0]*x[1]*x[3])/2 } );
gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone
Expand Down Expand Up @@ -228,7 +224,7 @@ namespace quda {

void DiracCoarse::initializeCoarse()
{
createY(gpu_setup, mapped);
createY(gpu_setup);

if (!gpu_setup) {

Expand Down Expand Up @@ -318,7 +314,7 @@ namespace quda {
switch(location) {
case QUDA_CUDA_FIELD_LOCATION:
if (enable_gpu) return;
createY(true, mapped);
createY(true);
createYhat(true);
Y_d->copy(*Y_h);
if (need_aos_gauge_copy) { Y_aos_d->copy(*Y_d); }
Expand Down
15 changes: 14 additions & 1 deletion lib/interface_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1336,6 +1336,20 @@ void freeCloverQuda(void)

void flushChronoQuda(int i) { flushChrono(i); }

/**
 * Release all cached allocations held by the memory pool of the given
 * type.  Only the device pool and the pinned-host pool are supported;
 * any other memory type is reported as an error.
 */
void flushPoolQuda(QudaMemoryType type)
{
  if (type == QUDA_MEMORY_DEVICE) {
    pool::flush_device();
  } else if (type == QUDA_MEMORY_HOST_PINNED) {
    pool::flush_pinned();
  } else {
    errorQuda("MemoryType %d not supported", type);
  }
}

void endQuda(void)
{
if (!initialized) return;
Expand Down Expand Up @@ -2755,7 +2769,6 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param)
Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec);
csParam.setPrecision(Bprec, Bprec, true);
if (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION) csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
csParam.mem_type = mg_param.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? QUDA_MEMORY_MAPPED : QUDA_MEMORY_DEVICE;
B.resize(mg_param.n_vec[0]);

if (mg_param.transfer_type[0] == QUDA_TRANSFER_COARSE_KD || mg_param.transfer_type[0] == QUDA_TRANSFER_OPTIMIZED_KD
Expand Down
6 changes: 5 additions & 1 deletion lib/inv_bicgstab_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ namespace quda {
} else {
csParam.create = QUDA_NULL_FIELD_CREATE;
resize(r0, b.size(), csParam);
blas::copy(r0, r);
}
} else {
csParam.create = QUDA_NULL_FIELD_CREATE;
Expand Down Expand Up @@ -126,7 +125,10 @@ namespace quda {
if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
mat(r, x);
r2 = blas::xmyNorm(b, r);
for (auto i = 0u; i < b.size(); i++)
if (b2[i] == 0) b2[i] = r2[i];
for (auto i = 0u; i < x.size(); i++) std::swap(y[i], x[i]);
create_alias(x_sloppy, x); // need to update alias since x has been swapped
} else {
blas::copy(r, b);
r2 = b2;
Expand All @@ -145,6 +147,8 @@ namespace quda {
if (param.precision != param.precision_sloppy) {
blas::copy(r_sloppy, r);
blas::copy(r0, param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO ? b : r);
} else {
if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) blas::copy(r0, r);
}

getProfile().TPSTOP(QUDA_PROFILE_INIT);
Expand Down
2 changes: 2 additions & 0 deletions lib/inv_bicgstabl_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,8 @@ namespace quda {
blas::xpay(b, -1.0, r_full);
r2 = b2; // dummy setting
}
for (auto i = 0u; i < b.size(); i++)
if (b2[i] == 0) b2[i] = r2[i];
blas::copy(y, x); // we accumulate into y
} else {
blas::copy(r_full, b); // r[0] = b
Expand Down
3 changes: 0 additions & 3 deletions lib/milc_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2491,9 +2491,6 @@ void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precis
mg_param.setup_location[i] = QUDA_CUDA_FIELD_LOCATION; // setup_location[i];
}

// whether to run GPU setup but putting temporaries into mapped (slow CPU) memory
mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE;

// coarsening the spin on the first restriction is undefined for staggered fields.
mg_param.spin_block_size[0] = 0;
if (input_struct.optimized_kd == QUDA_TRANSFER_OPTIMIZED_KD
Expand Down
3 changes: 1 addition & 2 deletions lib/multigrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,8 +412,7 @@ namespace quda
diracParam.dslash_use_mma = param.mg_global.dslash_use_mma[param.level + 1];
diracParam.allow_truncation = (param.mg_global.allow_truncation == QUDA_BOOLEAN_TRUE) ? true : false;

diracCoarseResidual = new DiracCoarse(diracParam, param.setup_location == QUDA_CUDA_FIELD_LOCATION ? true : false,
param.mg_global.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? true : false);
diracCoarseResidual = new DiracCoarse(diracParam, param.setup_location == QUDA_CUDA_FIELD_LOCATION ? true : false);

// create smoothing operators
diracParam.dirac = const_cast<Dirac *>(param.matSmooth->Expose());
Expand Down
2 changes: 2 additions & 0 deletions lib/targets/cuda/malloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@ namespace quda

void flush_pinned()
{
logQuda(QUDA_DEBUG_VERBOSE, "Flushing host pinned memory pool\n");
if (pinned_memory_pool) {
for (auto it : pinnedCache) { host_free(it.second); }
pinnedCache.clear();
Expand All @@ -782,6 +783,7 @@ namespace quda

void flush_device()
{
logQuda(QUDA_DEBUG_VERBOSE, "Flushing device memory pool\n");
if (device_memory_pool) {
for (auto it : deviceCache) { device_free(it.second); }
deviceCache.clear();
Expand Down
4 changes: 3 additions & 1 deletion lib/targets/hip/malloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,7 @@ namespace quda
void flush_pinned()
{
if (pinned_memory_pool) {
logQuda(QUDA_DEBUG_VERBOSE, "Flushing host pinned memory pool\n");
std::multimap<size_t, void *>::iterator it;
for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
void *ptr = it->second;
Expand All @@ -725,7 +726,8 @@ namespace quda
void flush_device()
{
if (device_memory_pool) {
std::multimap<size_t, void *>::iterator it;
logQuda(QUDA_DEBUG_VERBOSE, "Flushing device memory pool\n");
std::multimap<size_t, void *>::iterator it;
for (it = deviceCache.begin(); it != deviceCache.end(); it++) {
void *ptr = it->second;
device_free(ptr);
Expand Down
6 changes: 0 additions & 6 deletions tests/utils/set_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,9 +609,6 @@ void setMultigridParam(QudaMultigridParam &mg_param)
mg_param.setup_location[i] = setup_location[i];
}

// whether to run GPU setup but putting temporaries into mapped (slow CPU) memory
mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE;

// only coarsen the spin on the first restriction
mg_param.spin_block_size[0] = 2;

Expand Down Expand Up @@ -1226,9 +1223,6 @@ void setStaggeredMultigridParam(QudaMultigridParam &mg_param)
nu_post[i] = 2;
}

// whether to run GPU setup but putting temporaries into mapped (slow CPU) memory
mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE;

// coarsening the spin on the first restriction is undefined for staggered fields.
mg_param.spin_block_size[0] = 0;

Expand Down

0 comments on commit 9765847

Please sign in to comment.