# Compiling tmLQCD QPhiX DDalphaAMG
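All components are built from source. The paths below assume sources under `~/code`, separate build directories under `~/build`, and installation prefixes under `~/local`; this layout is purely conventional, a minimal sketch:

```
# create the directory layout assumed by the commands on this page
mkdir -p ~/code ~/build ~/local
cd ~/code
```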
```
git clone -b master https://github.com/etmc/tmLQCD
git clone -b master https://github.com/sbacchio/DDalphaAMG
git clone -b devel https://github.com/JeffersonLab/qphix
git clone https://github.com/etmc/lemon
git clone https://github.com/usqcd-software/c-lime
git clone https://github.com/usqcd-software/qmp
```
Build and install QMP:
```
$ cd ~/code/qmp
$ autoreconf
$ mkdir -p ~/build/qmp && cd ~/build/qmp
$ ~/code/qmp/configure \
  --prefix=${HOME}/local/qmp \
  --with-qmp-comms-type=MPI \
  CC=mpicc \
  CFLAGS=-std=c99
$ make -j10 && make install
```
Build and install c-lime:
```
$ cd ~/code/c-lime
$ ./autogen.sh
$ mkdir -p ~/build/lime && cd ~/build/lime
$ ~/code/c-lime/configure \
  --prefix=$HOME/local/lime \
  CC=icc
$ make -j10 && make install
```
Build and install lemon:
```
$ cd ~/code/lemon
$ autoreconf
$ mkdir -p ~/build/lemon && cd ~/build/lemon
$ ~/code/lemon/configure \
  --prefix=${HOME}/local/lemon \
  CC=mpicc
$ make -j10 && make install
```
QPhiX should ideally be compiled with the Intel compiler, in particular to get perfect thread pinning via KMP_AFFINITY.
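As a minimal sketch, the build environment might be prepared like this (the module names are hypothetical and site-specific; the KMP_AFFINITY value is the one used in the job script at the end of this page):

```
# load an Intel toolchain (module names are site-specific)
module load intel impi mkl
# pin OpenMP threads; see the job script below for the production setting
export KMP_AFFINITY="balanced,granularity=fine"
```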
Configure and build QPhiX out of source, here for AVX2 hardware:
```
$ mkdir -p ~/build/qphix && cd ~/build/qphix
$ CXX=mpicc \
  CXXFLAGS="-xCORE-AVX2 -fma -std=c++11 -O3 -qopenmp" \
  cmake -Disa=avx2 \
    -DQMP_DIR=${HOME}/local/qmp \
    -Dparallel_arch=parscalar \
    -Dhost_cxx=g++ \
    -Dhost_cxxflags="-std=c++11 -O3" \
    -Dtwisted_mass=TRUE \
    -Dtm_clover=TRUE \
    -Dclover=TRUE \
    -Drecursive_jN=10 \
    -Dtesting=FALSE \
    -DCMAKE_INSTALL_PREFIX=${HOME}/local/qphix/avx2 ~/code/qphix
$ make -j10 && make install
```
- For KNL: use `-xMIC-AVX512` in `CXXFLAGS` and `-Disa=avx512` (see the sketch below).
- For SKL: use `-xSKYLAKE-AVX512` in `CXXFLAGS` and `-Disa=avx512`.
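For example, a KNL build of QPhiX might be configured as follows (only the parts that differ from the AVX2 invocation above are spelled out; the `avx512` install prefix is just a naming suggestion):

```
$ mkdir -p ~/build/qphix-knl && cd ~/build/qphix-knl
$ CXX=mpicc \
  CXXFLAGS="-xMIC-AVX512 -fma -std=c++11 -O3 -qopenmp" \
  cmake -Disa=avx512 \
    -DCMAKE_INSTALL_PREFIX=${HOME}/local/qphix/avx512 \
    <all other -D options as in the AVX2 example> ~/code/qphix
$ make -j10 && make install
```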
Unfortunately, DDalphaAMG has no proper build system that would allow out-of-source builds. Edit its Makefile as follows (see the sketch after this list):

- add `PREFIX = ${HOME}/local/DDalphaAMG`
- change the compiler to `CC = mpicc`
- for `LIMEFLAGS` and `LIMELIB`, you can comment out the values: `LIMEFLAGS=#-DHAVE_LIME -I$(LIMEDIR)/include` and `LIMELIB=# -L$(LIMEDIR)/lib -llime`
- you should actually provide `LIMEDIR` if you want to use the binaries that come with DDalphaAMG
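As a sketch, the relevant lines of the edited Makefile might then read (with lime support commented out, as described above):

```
# installation prefix used by 'make install'
PREFIX = ${HOME}/local/DDalphaAMG
# build with the MPI compiler wrapper
CC = mpicc
# lime support disabled: values commented out
LIMEFLAGS=#-DHAVE_LIME -I$(LIMEDIR)/include
LIMELIB=# -L$(LIMEDIR)/lib -llime
```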
Then, in the source directory:
```
make -j10 && make install
```
The tmLQCD build system unfortunately does not perform a proper installation (to be fixed at some point), so you have to copy the resulting executables manually.

IMPORTANT: do NOT use `autoreconf` in the tmLQCD source directory. The tmLQCD build system is based on "incomplete" autotools support and running `autoreconf` will break your source directory; use plain `autoconf` as shown below.
```
$ cd ~/code/tmLQCD
$ autoconf # NOT autoreconf, see the note above!
$ mkdir -p ~/build/tmLQCD && cd ~/build/tmLQCD
$ ~/code/tmLQCD/configure \
  --with-limedir=$HOME/local/lime \
  --with-lemondir=$HOME/local/lemon \
  --with-qphixdir=$HOME/local/qphix/avx2 --with-qmpdir=$HOME/local/qmp \
  --with-DDalphaAMG=${HOME}/local/DDalphaAMG \
  --with-lapack=" -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a ${MKLROOT}/lib/intel64/libmkl_core.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm -ldl" \
  --with-mpidimension=4 --enable-omp --enable-mpi \
  --disable-sse2 --disable-sse3 \
  --enable-halfspinor --enable-gaugecopy \
  CC=mpicc CXX=mpiCC F77=ifort \
  CFLAGS="-O3 -std=c99 -qopenmp -xCORE-AVX2 -fma" \
  CXXFLAGS="-O3 -std=c++11 -qopenmp -xCORE-AVX2 -fma" \
  LDFLAGS="-qopenmp"
$ make -j10
```
- For KNL: use `-xMIC-AVX512` in `CFLAGS` and `CXXFLAGS` and add `--enable-qphix-soalen=8` (see the sketch below).
- For SKL: use `-xSKYLAKE-AVX512` in `CFLAGS` and `CXXFLAGS` and add `--enable-qphix-soalen=8`.
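For example, a KNL configuration of tmLQCD might change as follows (only the differing parts are spelled out; the QPhiX path assumes an `avx512` install as sketched earlier):

```
$ ~/code/tmLQCD/configure \
  --with-qphixdir=$HOME/local/qphix/avx512 \
  --enable-qphix-soalen=8 \
  <all other options as above> \
  CFLAGS="-O3 -std=c99 -qopenmp -xMIC-AVX512 -fma" \
  CXXFLAGS="-O3 -std=c++11 -qopenmp -xMIC-AVX512 -fma" \
  LDFLAGS="-qopenmp"
```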
Example input file: 48c96 twisted clover run on 54 KNL nodes, using many MPI tasks (32 per node) to compensate for strong-scaling problems on KNL:
```
L=48
T=96
NrXProcs = 3
NrYProcs = 6
NrZprocs = 6
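# geometry check: 54 nodes * 32 tasks/node = 1728 MPI tasks in total;
# NrXProcs * NrYProcs * NrZProcs = 3*6*6 = 108, so the remaining factor
# of 16 ends up in the time direction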
ompnumthreads = 4
# the reversibility check is enabled, so we run only five trajectories per
# job
Measurements = 5
# suppress the acceptance step for this many trajectories
thermalisationsweeps = 0
seed=8721681
EigenvaluePrecision = 1e-7
Startcondition = continue
InitialStoreCounter = readin
2KappaMu = 0.00033615600
2KappaMuBar = 0.03944230400
2KappaEpsBar = 0.04260777300
CSW = 1.74
kappa = 0.140065
NSave = 2
ThetaT = 1.0
UseEvenOdd = yes
userelativeprecision=yes
ReversibilityCheck = yes
ReversibilityCheckIntervall = 20
# for the beginning 3 is fine, afterwards run with 2 because 3 slows down progress by about 10%!
#DebugLevel = 3
DebugLevel = 2
ReproduceRandomNumbers = no
RanluxdLevel = 2
BeginDDalphaAMG
MGBlockX = 4
MGBlockY = 4
MGBlockZ = 4
MGBlockT = 3
MGSetupIter = 3
MGCoarseSetupIter = 3
MGNumberOfVectors = 24
MGNumberOfLevels = 3
MGCoarseMuFactor = 9
MGdtauUpdate = 0.026
MGUpdateSetupIter = 1
mgompnumthreads = 2
EndDDalphaAMG
BeginExternalInverter QPHIX
# physical cores per MPI task
NCores = 2
# block sizes (see qphix papers for details)
By = 8
Bz = 8
MinCt = 1
# thread geometry
# ompnumthreads = NCores * Sy * Sz
# hyperthreads should be specified here
Sy = 1
Sz = 2
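# here NCores * Sy * Sz = 2 * 1 * 2 = 4, matching ompnumthreads = 4 above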
# paddings in XY and XYZ blocks
PadXY = 1
PadXYZ = 0
EndExternalInverter
BeginMeasurement CORRELATORS
Frequency = 1
EndMeasurement
BeginMeasurement GRADIENTFLOW
Frequency = 4
EndMeasurement
BeginOperator CLOVER
CSW = 1.74
kappa = 0.140065
2KappaMu = 0.00033615600
solver = DDalphaAMG
SolverPrecision = 1e-18
MaxSolverIterations = 1000
useevenodd = no
EndOperator
BeginMonomial GAUGE
Type = Iwasaki
beta = 1.726
Timescale = 0
EndMonomial
BeginMonomial CLOVERDET
Timescale = 1
kappa = 0.140065
2KappaMu = 0.00033615600
CSW = 1.74
rho = 0.1573086528
MaxSolverIterations = 5000
AcceptancePrecision = 1.e-21
ForcePrecision = 1.e-16
Name = cloverdetlight
solver = mixedcg
useexternalinverter = qphix
usecompression = 12
usesloppyprecision = single
EndMonomial
BeginMonomial CLOVERDETRATIO
Timescale = 2
kappa = 0.140065
2KappaMu = 0.00033615600
# numerator shift
rho = 0.0312600528
rho2 = 0.1573086528
CSW = 1.74
MaxSolverIterations = 5000
AcceptancePrecision = 1.e-21
ForcePrecision = 1.e-16
Name = cloverdetratio1light
solver = mixedcg
useexternalinverter = qphix
usecompression = 12
usesloppyprecision = single
EndMonomial
BeginMonomial CLOVERDETRATIO
Timescale = 3
kappa = 0.140065
2KappaMu = 0.00033615600
# numerator shift
rho = 0.0060503328
rho2 = 0.0312600528
CSW = 1.74
MaxSolverIterations = 60000
AcceptancePrecision = 1.e-21
ForcePrecision = 1.e-18
Name = cloverdetratio2light
solver = mixedcg
useexternalinverter = qphix
usecompression = 12
usesloppyprecision = single
EndMonomial
BeginMonomial CLOVERDETRATIO
Timescale = 4
kappa = 0.140065
2KappaMu = 0.00033615600
rho = 0.0010083888
rho2 = 0.0060503328
CSW = 1.74
MaxSolverIterations = 60000
AcceptancePrecision = 1.e-21
ForcePrecision = 1.e-18
Name = cloverdetratio3light
solver = ddalphaamg
EndMonomial
BeginMonomial CLOVERDETRATIO
Timescale = 5
kappa = 0.140065
2KappaMu = 0.00033615600
rho = 0.0
rho2 = 0.0010083888
CSW = 1.74
MaxSolverIterations = 60000
AcceptancePrecision = 1.e-21
ForcePrecision = 1.e-18
Name = cloverdetratio4light
solver = ddalphaamg
EndMonomial
BeginMonomial NDCLOVERRAT
Timescale = 2
kappa = 0.140065
CSW = 1.74
AcceptancePrecision = 1e-21
ForcePrecision = 1e-16
StildeMin = 0.0000376
StildeMax = 4.7
Name = ndcloverrat1
DegreeOfRational = 10
Cmin = 0
Cmax = 2
ComputeEVFreq = 1
2Kappamubar = 0.03944230400
2Kappaepsbar = 0.04260777300
AddTrLog = yes
useexternalinverter = qphix
usecompression = 12
solver = cgmmsnd
EndMonomial
BeginMonomial NDCLOVERRAT
Timescale = 3
kappa = 0.140065
CSW = 1.74
AcceptancePrecision = 1e-21
ForcePrecision = 1e-16
StildeMin = 0.0000376
StildeMax = 4.7
Name = ndcloverrat2
DegreeOfRational = 10
Cmin = 3
Cmax = 4
ComputeEVFreq = 0
2Kappamubar = 0.03944230400
2Kappaepsbar = 0.04260777300
AddTrLog = no
useexternalinverter = qphix
usecompression = 12
solver = cgmmsnd
EndMonomial
BeginMonomial NDCLOVERRAT
Timescale = 4
kappa = 0.140065
CSW = 1.74
AcceptancePrecision = 1e-21
ForcePrecision = 1e-16
StildeMin = 0.0000376
StildeMax = 4.7
Name = ndcloverrat3
DegreeOfRational = 10
Cmin = 5
Cmax = 6
ComputeEVFreq = 0
2Kappamubar = 0.03944230400
2Kappaepsbar = 0.04260777300
AddTrLog = no
useexternalinverter = qphix
usecompression = 12
solver = cgmmsnd
EndMonomial
BeginMonomial NDCLOVERRAT
Timescale = 5
kappa = 0.140065
CSW = 1.74
AcceptancePrecision = 1e-21
ForcePrecision = 1e-16
StildeMin = 0.0000376
StildeMax = 4.7
Name = ndcloverrat4
DegreeOfRational = 10
Cmin = 7
Cmax = 9
ComputeEVFreq = 0
2Kappamubar = 0.03944230400
2Kappaepsbar = 0.04260777300
AddTrLog = no
useexternalinverter = qphix
usecompression = 12
solver = cgmmsnd
EndMonomial
BeginMonomial NDCLOVERRATCOR
Timescale = 1
kappa = 0.140065
CSW = 1.74
AcceptancePrecision = 1e-21
ForcePrecision = 1e-16
StildeMin = 0.0000376
StildeMax = 4.7
Name = ndcloverratcor
DegreeOfRational = 10
ComputeEVFreq = 0
2Kappamubar = 0.03944230400
2Kappaepsbar = 0.04260777300
useexternalinverter = qphix
usecompression = 12
solver = cgmmsnd
EndMonomial
BeginIntegrator
Type0 = 2MN
Type1 = 2MN
Type2 = 2MN
Type3 = 2MN
Type4 = 2MN
Type5 = 2MN
IntegrationSteps0 = 1
IntegrationSteps1 = 1
IntegrationSteps2 = 1
IntegrationSteps3 = 1
IntegrationSteps4 = 1
IntegrationSteps5 = 17
tau = 1.0
Lambda0 = 0.185
Lambda1 = 0.190
Lambda2 = 0.195
Lambda3 = 0.20
Lambda4 = 0.205
Lambda5 = 0.21
NumberOfTimescales = 6
MonitorForces = yes
EndIntegrator
```
Example input file: 32c64 run on 32 SKL nodes (2x24 cores per node), 8 MPI tasks per node, 6 threads per task, no clover term:
```
L=32
T=64
NrXProcs = 2
NrYProcs = 4
NrZProcs = 4
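# geometry check: 32 nodes * 8 tasks/node = 256 MPI tasks in total;
# NrXProcs * NrYProcs * NrZProcs = 2*4*4 = 32, so the remaining factor
# of 8 ends up in the time direction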
OMPNumThreads = 6
ReproduceRandomNumbers = no
RanluxdLevel = 2
Measurements = 10
Startcondition = continue
InitialStoreCounter = readin
DisableIOChecks = no
2KappaMu = 0.001305718
2Kappamubar = 0.04896441
2Kappaepsbar = 0.064306592
kappa = 0.1632147
NSave = 2
ThetaT = 1.
UseEvenOdd = yes
UseRelativePrecision = yes
ReversibilityCheck = no
ReversibilityCheckIntervall = 10
DebugLevel = 2
EigenvaluePrecision = 1e-7
BeginExternalInverter QPHIX
# physical cores per MPI task
NCores = 6
# block sizes (see qphix papers for details)
By = 8
Bz = 8
MinCt = 1
# thread geometry
# ompnumthreads = NCores * Sy * Sz
# hyperthreads should be specified here
Sy = 1
Sz = 1
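# here NCores * Sy * Sz = 6 * 1 * 1 = 6, matching OMPNumThreads = 6 above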
# paddings in XY and XYZ blocks
PadXY = 0
PadXYZ = 0
EndExternalInverter
BeginOperator TMWILSON
2KappaMu = 0.001305718
kappa = 0.1632147
UseEvenOdd = yes
MaxSolverIterations = 10000
SolverPrecision = 1e-18
# QPhiX is faster than DDalphaAMG here because of the setup overhead!
solver = mixedcg
UseExternalInverter = qphix
UseCompression = 12
UseSloppyPrecision = single
EndOperator
BeginMeasurement CORRELATORS
Frequency = 1
EndMeasurement
BeginMeasurement GRADIENTFLOW
Frequency = 4
StepSize = 0.02
EndMeasurement
BeginMonomial GAUGE
Type = Iwasaki
beta = 1.90
Timescale = 0
EndMonomial
BeginMonomial DET
Timescale = 1
2KappaMu = 0.08
kappa = 0.1632147
AcceptancePrecision = 1.e-20
ForcePrecision = 1.e-14
Name = det
solver = mixedcg
UseExternalInverter = qphix
UseCompression = 12
UseSloppyPrecision = single
MaxSolverIterations = 2000
EndMonomial
BeginMonomial DETRATIO
Timescale = 2
kappa = 0.1632147
kappa2 = 0.1632147
2KappaMu = 0.009
2KappaMu2 = 0.08
AcceptancePrecision = 1.e-20
ForcePrecision = 1.e-14
Name = detratio1
solver = mixedcg
UseExternalInverter = qphix
UseCompression = 12
UseSloppyPrecision = single
MaxSolverIterations = 4000
EndMonomial
BeginMonomial DETRATIO
Timescale = 3
kappa = 0.1632147
kappa2 = 0.1632147
2KappaMu = 0.001305718
2KappaMu2 = 0.009
AcceptancePrecision = 1.e-22
ForcePrecision = 1.e-16
Name = detratio2
MaxSolverIterations = 25000
# at this mass, QPhiX is faster than DDalphaAMG here because of the setup update overhead!
solver = mixedcg
UseExternalInverter = qphix
UseCompression = 12
UseSloppyPrecision = single
EndMonomial
BeginMonomial NDRAT
Timescale = 2
StildeMin = 0.000014
StildeMax = 2.8
ComputeEVFreq = 0
2Kappamubar = 0.04896441
2Kappaepsbar = 0.064306592
kappa = 0.1632147
ForcePrecision = 1e-16
AcceptancePrecision = 1e-20
DegreeOfRational = 10
Cmin = 0
Cmax = 5
Name = ndrat_0_5
UseExternalInverter = qphix
UseCompression = 12
Solver = cgmmsnd
EndMonomial
BeginMonomial NDRAT
Timescale = 3
StildeMin = 0.000014
StildeMax = 2.8
ComputeEVFreq = 0
2Kappamubar = 0.04896441
2Kappaepsbar = 0.064306592
kappa = 0.1632147
ForcePrecision = 1e-16
AcceptancePrecision = 1e-20
DegreeOfRational = 10
Cmin = 6
Cmax = 9
Name = ndrat_6_9
UseExternalInverter = qphix
UseCompression = 12
Solver = cgmmsnd
EndMonomial
BeginMonomial NDRATCOR
Timescale = 1
StildeMin = 0.000014
StildeMax = 2.8
ComputeEVFreq = 10
2Kappamubar = 0.04896441
2Kappaepsbar = 0.064306592
kappa = 0.1632147
AcceptancePrecision = 1e-22
DegreeOfRational = 10
Name = ndratcor_0_9
UseExternalInverter = qphix
UseCompression = 12
Solver = cgmmsnd
EndMonomial
BeginIntegrator
Type0 = 2MNFG
Type1 = 2MNFG
Type2 = 2MNFG
Type3 = 2MN
IntegrationSteps0 = 1
IntegrationSteps1 = 1
IntegrationSteps2 = 1
IntegrationSteps3 = 9
tau = 1.
## 2MNFG
Lambda0 = 0.166666667
Lambda1 = 0.166666667
Lambda2 = 0.166666667
## 2MN
Lambda3 = 0.193183326
NumberOfTimescales = 4
MonitorForces = yes
EndIntegrator
```
On SKL / KNL, proper MPI task and thread pinning is mandatory for good performance. Using SLURM, this can be achieved with a variation of the following job script:
```
#!/bin/bash -x
#SBATCH --time=01:30:00
#SBATCH --mem=82G
#SBATCH --nodes=32
#SBATCH --exclusive
# okay, we need SLURM to give us complete control over the task pinning
# this seems to work in order to achieve what we want on an SKL machine
# with 2x24 cores per node and hyperthreading enabled
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --job-name=yyy
#SBATCH --mail-type=ALL
#SBATCH --mail-user=xxx
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=disable
export HFI_NO_CPUAFFINITY=1
export I_MPI_HYDRA_ENV=all
export I_MPI_PIN=1
## at debug level 4, Intel MPI outputs task pinning layout
## which can be used to confirm that desired layout has been
## obtained
export I_MPI_DEBUG=4
## pin domain extends over hyperthreads (2*Ncores = 2*OMP_NUM_THREADS in this case)
export I_MPI_PIN_DOMAIN=12
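## with 2x24 cores and 2 hyperthreads per core there are 96 hardware threads
## per node; at 8 tasks per node this gives 96 / 8 = 12 hardware threads
## (6 physical cores) per domain, matching OMP_NUM_THREADS=6 below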
export OMP_NUM_THREADS=6
## verbose will output thread pinning, can and should be disabled in
## production because DDalphaAMG and tmLQCD will use different numbers
## of threads, resulting in LOTS of output every time the
## numbers are changed...
# export KMP_AFFINITY="balanced,granularity=fine,verbose"
export KMP_AFFINITY="balanced,granularity=fine"
EXE=tmLQCD/hmc_tm
RUNDIR=job_dir
ODIR=${RUNDIR}/outputs
if [ ! -d ${ODIR} ]; then
  mkdir -p ${ODIR}
fi
ifile=32n.8ppn.6tpt.hmc.QPhiX.DDalphaAMG.input
cp ${ifile} ${RUNDIR}
cd ${RUNDIR}
ofile=${ODIR}/out.x2_y4_z4_t6.${SLURM_JOB_NAME}.${SLURM_JOB_ID}.out
date > ${ofile}
## we do not use srun but mpirun provided by Intel MPI
mpirun -n $(( 32 * 8 )) -ppn 8 ${EXE} -f ${ifile} 2>&1 | tee -a ${ofile}
## with the pipe into tee, $? would hold the exit status of tee rather than
## that of mpirun, so take the status from PIPESTATUS instead
RVAL=${PIPESTATUS[0]}
date >> ${ofile}
```