Skip to content

Commit

Permalink
Merge pull request #1530 from lattice/hotfix/pool
Browse files Browse the repository at this point in the history
Hotfix/pool
  • Loading branch information
weinbe2 authored Jan 13, 2025
2 parents b58f1ec + ab7b1ae commit 9765847
Show file tree
Hide file tree
Showing 13 changed files with 48 additions and 41 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,9 @@ endif()

find_package(Threads REQUIRED)
if(QUDA_OPENMP)
# NVHPC's host compiler (nvc++) is known not to work with QUDA's OpenMP
# build, so fail early with a clear diagnostic rather than at compile time.
if(${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC")
message(FATAL_ERROR "Host compiler (nvc++) not supported with QUDA_OPENMP=ON")
endif()
find_package(OpenMP REQUIRED)
endif()

Expand Down
7 changes: 2 additions & 5 deletions include/dirac_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -1911,14 +1911,12 @@ namespace quda {
const bool gpu_setup; /** Where to do the coarse-operator construction*/
mutable bool init_gpu; /** Whether this instance did the GPU allocation or not */
mutable bool init_cpu; /** Whether this instance did the CPU allocation or not */
const bool mapped; /** Whether we allocate Y and X GPU fields in mapped memory or not */

/**
@brief Allocate the Y and X fields
@param[in] gpu Whether to allocate on gpu (true) or cpu (false)
@param[in] mapped whether to put gpu allocations into mapped memory
*/
void createY(bool gpu = true, bool mapped = false) const;
void createY(bool gpu = true) const;

/**
@brief Allocate the Yhat and Xinv fields
Expand All @@ -1935,9 +1933,8 @@ namespace quda {
/**
@param[in] param Parameters defining this operator
@param[in] gpu_setup Whether to do the setup on GPU or CPU
@param[in] mapped Set to true to put Y and X fields in mapped memory
*/
DiracCoarse(const DiracParam &param, bool gpu_setup=true, bool mapped=false);
DiracCoarse(const DiracParam &param, bool gpu_setup=true);

/**
@param[in] param Parameters defining this operator
Expand Down
14 changes: 9 additions & 5 deletions include/quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -776,11 +776,6 @@ extern "C" {
/** Whether to use eigenvectors for the nullspace or, if the coarsest instance deflate*/
QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL];

/** Minimize device memory allocations during the adaptive setup,
placing temporary fields in mapped memory instead of device
memory */
QudaBoolean setup_minimize_memory;

/** Whether to compute the null vectors or reload them */
QudaComputeNullVector compute_null_vector;

Expand Down Expand Up @@ -1814,6 +1809,15 @@ extern "C" {
*/
void destroyDeflationQuda(void *df_instance);

/**
* @brief Flush the memory pools associated with the supplied type.
* At present this only supports the options QUDA_MEMORY_DEVICE and
* QUDA_MEMORY_HOST_PINNED, and any other type will result in an
* error.
* @param[in] type The memory type whose pool we wish to flush.
*/
void flushPoolQuda(QudaMemoryType type);

void setMPICommHandleQuda(void *mycomm);

// Parameter set for quark smearing operations
Expand Down
6 changes: 0 additions & 6 deletions lib/check_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -1024,12 +1024,6 @@ void printQudaMultigridParam(QudaMultigridParam *param) {
#endif
}

#ifdef INIT_PARAM
P(setup_minimize_memory, QUDA_BOOLEAN_FALSE);
#else
P(setup_minimize_memory, QUDA_BOOLEAN_INVALID);
#endif

P(compute_null_vector, QUDA_COMPUTE_NULL_VECTOR_INVALID);
P(generate_all_levels, QUDA_BOOLEAN_INVALID);

Expand Down
18 changes: 7 additions & 11 deletions lib/dirac_coarse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

namespace quda {

DiracCoarse::DiracCoarse(const DiracParam &param, bool gpu_setup, bool mapped) :
DiracCoarse::DiracCoarse(const DiracParam &param, bool gpu_setup) :
Dirac(param),
mass(param.mass),
mu(param.mu),
Expand All @@ -23,8 +23,7 @@ namespace quda {
enable_cpu(false),
gpu_setup(gpu_setup),
init_gpu(gpu_setup),
init_cpu(!gpu_setup),
mapped(mapped)
init_cpu(!gpu_setup)
{
initializeCoarse();
}
Expand Down Expand Up @@ -59,8 +58,7 @@ namespace quda {
enable_cpu(Y_h ? true : false),
gpu_setup(true),
init_gpu(enable_gpu ? false : true),
init_cpu(enable_cpu ? false : true),
mapped(Y_d->MemType() == QUDA_MEMORY_MAPPED)
init_cpu(enable_cpu ? false : true)
{

constexpr QudaGaugeFieldOrder gOrder = QUDA_MILC_GAUGE_ORDER;
Expand Down Expand Up @@ -116,12 +114,11 @@ namespace quda {
enable_cpu(dirac.enable_cpu),
gpu_setup(dirac.gpu_setup),
init_gpu(enable_gpu ? false : true),
init_cpu(enable_cpu ? false : true),
mapped(dirac.mapped)
init_cpu(enable_cpu ? false : true)
{
}

void DiracCoarse::createY(bool gpu, bool mapped) const
void DiracCoarse::createY(bool gpu) const
{
int ndim = transfer->Vectors().Ndim();
lat_dim_t x;
Expand All @@ -146,7 +143,6 @@ namespace quda {
gParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
gParam.nFace = 1;
gParam.geometry = QUDA_COARSE_GEOMETRY;
if (mapped) gParam.mem_type = QUDA_MEMORY_MAPPED;

int pad = std::max( { (x[0]*x[1]*x[2])/2, (x[1]*x[2]*x[3])/2, (x[0]*x[2]*x[3])/2, (x[0]*x[1]*x[3])/2 } );
gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone
Expand Down Expand Up @@ -228,7 +224,7 @@ namespace quda {

void DiracCoarse::initializeCoarse()
{
createY(gpu_setup, mapped);
createY(gpu_setup);

if (!gpu_setup) {

Expand Down Expand Up @@ -318,7 +314,7 @@ namespace quda {
switch(location) {
case QUDA_CUDA_FIELD_LOCATION:
if (enable_gpu) return;
createY(true, mapped);
createY(true);
createYhat(true);
Y_d->copy(*Y_h);
if (need_aos_gauge_copy) { Y_aos_d->copy(*Y_d); }
Expand Down
15 changes: 14 additions & 1 deletion lib/interface_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1336,6 +1336,20 @@ void freeCloverQuda(void)

void flushChronoQuda(int i) { flushChrono(i); }

/**
 * Release all cached allocations held by the memory pool of the given
 * type.  Only the device pool and the pinned-host pool are supported;
 * any other memory type is reported as an error.
 */
void flushPoolQuda(QudaMemoryType type)
{
  if (type == QUDA_MEMORY_DEVICE) {
    pool::flush_device();
  } else if (type == QUDA_MEMORY_HOST_PINNED) {
    pool::flush_pinned();
  } else {
    errorQuda("MemoryType %d not supported", type);
  }
}

void endQuda(void)
{
if (!initialized) return;
Expand Down Expand Up @@ -2755,7 +2769,6 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param)
Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec);
csParam.setPrecision(Bprec, Bprec, true);
if (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION) csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
csParam.mem_type = mg_param.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? QUDA_MEMORY_MAPPED : QUDA_MEMORY_DEVICE;
B.resize(mg_param.n_vec[0]);

if (mg_param.transfer_type[0] == QUDA_TRANSFER_COARSE_KD || mg_param.transfer_type[0] == QUDA_TRANSFER_OPTIMIZED_KD
Expand Down
6 changes: 5 additions & 1 deletion lib/inv_bicgstab_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ namespace quda {
} else {
csParam.create = QUDA_NULL_FIELD_CREATE;
resize(r0, b.size(), csParam);
blas::copy(r0, r);
}
} else {
csParam.create = QUDA_NULL_FIELD_CREATE;
Expand Down Expand Up @@ -126,7 +125,10 @@ namespace quda {
if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
mat(r, x);
r2 = blas::xmyNorm(b, r);
for (auto i = 0u; i < b.size(); i++)
if (b2[i] == 0) b2[i] = r2[i];
for (auto i = 0u; i < x.size(); i++) std::swap(y[i], x[i]);
create_alias(x_sloppy, x); // need to update alias since x has been swapped
} else {
blas::copy(r, b);
r2 = b2;
Expand All @@ -145,6 +147,8 @@ namespace quda {
if (param.precision != param.precision_sloppy) {
blas::copy(r_sloppy, r);
blas::copy(r0, param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO ? b : r);
} else {
if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) blas::copy(r0, r);
}

getProfile().TPSTOP(QUDA_PROFILE_INIT);
Expand Down
2 changes: 2 additions & 0 deletions lib/inv_bicgstabl_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,8 @@ namespace quda {
blas::xpay(b, -1.0, r_full);
r2 = b2; // dummy setting
}
for (auto i = 0u; i < b.size(); i++)
if (b2[i] == 0) b2[i] = r2[i];
blas::copy(y, x); // we accumulate into y
} else {
blas::copy(r_full, b); // r[0] = b
Expand Down
3 changes: 0 additions & 3 deletions lib/milc_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2491,9 +2491,6 @@ void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precis
mg_param.setup_location[i] = QUDA_CUDA_FIELD_LOCATION; // setup_location[i];
}

// whether to run GPU setup but putting temporaries into mapped (slow CPU) memory
mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE;

// coarsening the spin on the first restriction is undefined for staggered fields.
mg_param.spin_block_size[0] = 0;
if (input_struct.optimized_kd == QUDA_TRANSFER_OPTIMIZED_KD
Expand Down
3 changes: 1 addition & 2 deletions lib/multigrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,8 +412,7 @@ namespace quda
diracParam.dslash_use_mma = param.mg_global.dslash_use_mma[param.level + 1];
diracParam.allow_truncation = (param.mg_global.allow_truncation == QUDA_BOOLEAN_TRUE) ? true : false;

diracCoarseResidual = new DiracCoarse(diracParam, param.setup_location == QUDA_CUDA_FIELD_LOCATION ? true : false,
param.mg_global.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? true : false);
diracCoarseResidual = new DiracCoarse(diracParam, param.setup_location == QUDA_CUDA_FIELD_LOCATION ? true : false);

// create smoothing operators
diracParam.dirac = const_cast<Dirac *>(param.matSmooth->Expose());
Expand Down
2 changes: 2 additions & 0 deletions lib/targets/cuda/malloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@ namespace quda

void flush_pinned()
{
logQuda(QUDA_DEBUG_VERBOSE, "Flushing host pinned memory pool\n");
if (pinned_memory_pool) {
for (auto it : pinnedCache) { host_free(it.second); }
pinnedCache.clear();
Expand All @@ -782,6 +783,7 @@ namespace quda

void flush_device()
{
logQuda(QUDA_DEBUG_VERBOSE, "Flushing device memory pool\n");
if (device_memory_pool) {
for (auto it : deviceCache) { device_free(it.second); }
deviceCache.clear();
Expand Down
4 changes: 3 additions & 1 deletion lib/targets/hip/malloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,7 @@ namespace quda
void flush_pinned()
{
if (pinned_memory_pool) {
logQuda(QUDA_DEBUG_VERBOSE, "Flushing host pinned memory pool\n");
std::multimap<size_t, void *>::iterator it;
for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
void *ptr = it->second;
Expand All @@ -725,7 +726,8 @@ namespace quda
void flush_device()
{
if (device_memory_pool) {
std::multimap<size_t, void *>::iterator it;
logQuda(QUDA_DEBUG_VERBOSE, "Flushing device memory pool\n");
std::multimap<size_t, void *>::iterator it;
for (it = deviceCache.begin(); it != deviceCache.end(); it++) {
void *ptr = it->second;
device_free(ptr);
Expand Down
6 changes: 0 additions & 6 deletions tests/utils/set_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,9 +609,6 @@ void setMultigridParam(QudaMultigridParam &mg_param)
mg_param.setup_location[i] = setup_location[i];
}

// whether to run GPU setup but putting temporaries into mapped (slow CPU) memory
mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE;

// only coarsen the spin on the first restriction
mg_param.spin_block_size[0] = 2;

Expand Down Expand Up @@ -1226,9 +1223,6 @@ void setStaggeredMultigridParam(QudaMultigridParam &mg_param)
nu_post[i] = 2;
}

// whether to run GPU setup but putting temporaries into mapped (slow CPU) memory
mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE;

// coarsening the spin on the first restriction is undefined for staggered fields.
mg_param.spin_block_size[0] = 0;

Expand Down

0 comments on commit 9765847

Please sign in to comment.