
Compiling tmLQCD QPhiX DDalphaAMG

Required sources

git clone -b master https://github.com/etmc/tmLQCD
git clone -b master https://github.com/sbacchio/DDalphaAMG
git clone -b devel https://github.com/JeffersonLab/qphix
git clone https://github.com/etmc/lemon
git clone https://github.com/usqcd-software/c-lime
git clone https://github.com/usqcd-software/qmp

QMP

$ cd ~/code/qmp
$ autoreconf
$ mkdir -p ~/build/qmp && cd ~/build/qmp
$ ~/code/qmp/configure \
  --prefix=${HOME}/local/qmp \
  --with-qmp-comms-type=MPI \
  CC=mpicc \
  CFLAGS=-std=c99

$ make -j10 && make install

LIME

$ cd ~/code/c-lime
$ ./autogen.sh
$ mkdir -p ~/build/lime && cd ~/build/lime
$ ~/code/c-lime/configure \
  --prefix=$HOME/local/lime \
  CC=icc

$ make -j10 && make install

LEMON

$ cd ~/code/lemon
$ autoreconf
$ mkdir -p ~/build/lemon && cd ~/build/lemon
$ ~/code/lemon/configure \
  --prefix=${HOME}/local/lemon \
  CC=mpicc

$ make -j10 && make install

QPHIX

Ideally, compile QPhiX with the Intel compiler, in particular to get precise thread pinning via KMP_AFFINITY.
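
On a cluster that uses environment modules, the toolchain might be set up roughly like this before building anything (the module names below are purely illustrative and differ from site to site):

$ module load Intel IntelMPI CMake   # illustrative module names, check "module avail"
$ mpiicpc -show                      # verify which compiler the Intel MPI C++ wrapper invokes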

$ mkdir -p ~/build/qphix && cd ~/build/qphix
$ CXX=mpiicpc \
CXXFLAGS="-xCORE-AVX2 -fma -std=c++11 -O3 -qopenmp" \
cmake -Disa=avx2  \
      -DQMP_DIR=${HOME}/local/qmp \
      -Dparallel_arch=parscalar \
      -Dhost_cxx=g++ \
      -Dhost_cxxflags="-std=c++11 -O3" \
      -Dtwisted_mass=TRUE \
      -Dtm_clover=TRUE \
      -Dclover=TRUE \
      -Drecursive_jN=10 \
      -Dtesting=FALSE  \
      -DCMAKE_INSTALL_PREFIX=${HOME}/local/qphix/avx2 ~/code/qphix

$ make -j10 && make install
  • For KNL: use -xMIC-AVX512 and -Disa=avx512 (see the sketch below)
  • For SKL: use -xSKYLAKE-AVX512 and -Disa=avx512
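
For illustration, a KNL build of QPhiX might then look like the following sketch; relative to the AVX2 call above, only the -x target flag, -Disa and the install prefix change (the avx512 subdirectory is simply a naming choice):

$ CXX=mpiicpc \
CXXFLAGS="-xMIC-AVX512 -fma -std=c++11 -O3 -qopenmp" \
cmake -Disa=avx512 \
      -DQMP_DIR=${HOME}/local/qmp \
      -Dparallel_arch=parscalar \
      -Dhost_cxx=g++ \
      -Dhost_cxxflags="-std=c++11 -O3" \
      -Dtwisted_mass=TRUE \
      -Dtm_clover=TRUE \
      -Dclover=TRUE \
      -Drecursive_jN=10 \
      -Dtesting=FALSE \
      -DCMAKE_INSTALL_PREFIX=${HOME}/local/qphix/avx512 ~/code/qphix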

DDalphaAMG

Unfortunately, DDalphaAMG does not have a build system that supports out-of-source builds. Edit its Makefile as follows:

  • add PREFIX = ${HOME}/local/DDalphaAMG
  • change the compiler to CC = mpicc
  • for LIMEFLAGS and LIMELIB, you can comment out the values:
    • LIMEFLAGS=#-DHAVE_LIME -I$(LIMEDIR)/include
    • LIMELIB=# -L$(LIMEDIR)/lib -llime
    • however, you should provide LIMEDIR if you want to use the standalone binaries that come with DDalphaAMG (see the sketch after this list)
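
After these edits, the relevant lines at the top of the Makefile might look roughly as follows (the LIMEDIR path is just an example; the LIME variables only need to be set if you want the standalone DDalphaAMG binaries, otherwise comment them out as described above):

PREFIX = ${HOME}/local/DDalphaAMG
CC = mpicc
LIMEDIR = ${HOME}/local/lime
LIMEFLAGS = -DHAVE_LIME -I$(LIMEDIR)/include
LIMELIB = -L$(LIMEDIR)/lib -llime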

In the source directory, then:

make -j10 && make install

tmLQCD

The tmLQCD build system unfortunately does not provide a proper installation step (to be fixed at some point), so you have to copy the resulting executables manually.

IMPORTANT: do NOT use autoreconf in the tmLQCD source directory. The tmLQCD build system is based on "incomplete" autotools support and the command will break your source directory.

$ cd ~/code/tmLQCD
$ autoconf         # see note above!!
$ mkdir -p ~/build/tmLQCD && cd ~/build/tmLQCD
$ ~/code/tmLQCD/configure \
  --with-limedir=$HOME/local/lime \
  --with-lemondir=$HOME/local/lemon \
  --with-qphixdir=$HOME/local/qphix/avx2 --with-qmpdir=$HOME/local/qmp \
  --with-DDalphaAMG=${HOME}/local/DDalphaAMG \
  --with-lapack=" -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a ${MKLROOT}/lib/intel64/libmkl_core.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm -ldl" \
  --with-mpidimension=4 --enable-omp --enable-mpi \
  --disable-sse2 --disable-sse3 \
  --enable-halfspinor --enable-gaugecopy \
  CC=mpicc CXX=mpiCC F77=ifort \
  CFLAGS="-O3 -std=c99 -qopenmp -xCORE-AVX2 -fma" \
  CXXFLAGS="-O3 -std=c++11 -qopenmp -xCORE-AVX2 -fma" \
  LDFLAGS="-qopenmp"

$ make -j10
  • For KNL: use -xMIC-AVX512 in CFLAGS/CXXFLAGS and add --enable-qphix-soalen=8 (see the sketch below)
  • For SKL: use -xSKYLAKE-AVX512 in CFLAGS/CXXFLAGS and add --enable-qphix-soalen=8
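
As a sketch, the KNL configure call might then look as follows, assuming QPhiX was built with -Disa=avx512 and installed under ${HOME}/local/qphix/avx512 as in the QPhiX example above; only the QPhiX path, the SoA length and the -x target flag differ from the AVX2 call:

$ ~/code/tmLQCD/configure \
  --with-limedir=$HOME/local/lime \
  --with-lemondir=$HOME/local/lemon \
  --with-qphixdir=$HOME/local/qphix/avx512 --with-qmpdir=$HOME/local/qmp \
  --enable-qphix-soalen=8 \
  --with-DDalphaAMG=${HOME}/local/DDalphaAMG \
  --with-lapack=" -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a ${MKLROOT}/lib/intel64/libmkl_core.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm -ldl" \
  --with-mpidimension=4 --enable-omp --enable-mpi \
  --disable-sse2 --disable-sse3 \
  --enable-halfspinor --enable-gaugecopy \
  CC=mpicc CXX=mpiCC F77=ifort \
  CFLAGS="-O3 -std=c99 -qopenmp -xMIC-AVX512 -fma" \
  CXXFLAGS="-O3 -std=c++11 -qopenmp -xMIC-AVX512 -fma" \
  LDFLAGS="-qopenmp"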

Input Files

48c96 twisted clover on 54 KNL nodes, many MPI tasks (32 per node) to compensate for strong-scaling problems on KNL

L=48
T=96

NrXProcs = 3
NrYProcs = 6
NrZprocs = 6
ompnumthreads = 4

# the reversibility check is enabled, so we run only five trajectories per
# job
Measurements = 5

# suppress the acceptance step for this many trajectories
thermalisationsweeps = 0

seed=8721681

EigenvaluePrecision = 1e-7

Startcondition = continue
InitialStoreCounter = readin

2KappaMu = 0.00033615600
2KappaMuBar = 0.03944230400
2KappaEpsBar = 0.04260777300
CSW = 1.74
kappa = 0.140065
NSave = 2
ThetaT = 1.0
UseEvenOdd = yes
userelativeprecision=yes

ReversibilityCheck = yes
ReversibilityCheckIntervall = 20

# at the beginning 3 is fine; afterwards run with 2, because 3 slows down progress by about 10%!
#DebugLevel = 3
DebugLevel = 2

ReproduceRandomNumbers = no
RanluxdLevel = 2

BeginDDalphaAMG
  MGBlockX = 4
  MGBlockY = 4
  MGBlockZ = 4
  MGBlockT = 3
  MGSetupIter = 3
  MGCoarseSetupIter = 3
  MGNumberOfVectors = 24
  MGNumberOfLevels = 3
  MGCoarseMuFactor = 9
  MGdtauUpdate = 0.026
  MGUpdateSetupIter = 1
  mgompnumthreads = 2
EndDDalphaAMG

BeginExternalInverter QPHIX
  # physical cores per MPI task
  NCores = 2
  
  # block sizes (see qphix papers for details)
  By = 8
  Bz = 8
  MinCt = 1
  
  # thread geometry
  # ompnumthreads = NCores * Sy * Sz
  # hyperthreads should be specified here
  Sy = 1
  Sz = 2
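  # here: 2 * 1 * 2 = 4, matching ompnumthreads = 4 above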
  
  # paddings in XY and XYZ blocks
  PadXY = 1
  PadXYZ = 0
EndExternalInverter  

BeginMeasurement CORRELATORS
  Frequency = 1
EndMeasurement

BeginMeasurement GRADIENTFLOW
  Frequency = 4
EndMeasurement

BeginOperator CLOVER
  CSW = 1.74
  kappa = 0.140065
  2KappaMu = 0.00033615600
  solver = DDalphaAMG
  SolverPrecision = 1e-18
  MaxSolverIterations = 1000
  useevenodd = no
EndOperator

BeginMonomial GAUGE
  Type = Iwasaki
  beta = 1.726
  Timescale = 0
EndMonomial

BeginMonomial CLOVERDET
  Timescale = 1
  kappa = 0.140065
  2KappaMu = 0.00033615600
  CSW = 1.74
  rho = 0.1573086528
  MaxSolverIterations = 5000
  AcceptancePrecision =  1.e-21
  ForcePrecision = 1.e-16
  Name = cloverdetlight
  solver = mixedcg
  useexternalinverter = qphix
  usecompression = 12
  usesloppyprecision = single
EndMonomial

BeginMonomial CLOVERDETRATIO
  Timescale = 2
  kappa = 0.140065
  2KappaMu = 0.00033615600
  # numerator shift
  rho = 0.0312600528
  rho2 = 0.1573086528
  CSW = 1.74
  MaxSolverIterations = 5000
  AcceptancePrecision =  1.e-21
  ForcePrecision = 1.e-16
  Name = cloverdetratio1light
  solver = mixedcg
  useexternalinverter = qphix
  usecompression = 12
  usesloppyprecision = single
EndMonomial

BeginMonomial CLOVERDETRATIO
  Timescale = 3
  kappa = 0.140065
  2KappaMu = 0.00033615600
  # numerator shift
  rho = 0.0060503328
  rho2 = 0.0312600528
  CSW = 1.74
  MaxSolverIterations = 60000
  AcceptancePrecision =  1.e-21
  ForcePrecision = 1.e-18
  Name = cloverdetratio2light
  solver = mixedcg
  useexternalinverter = qphix
  usecompression = 12
  usesloppyprecision = single
EndMonomial

BeginMonomial CLOVERDETRATIO
  Timescale = 4
  kappa = 0.140065
  2KappaMu = 0.00033615600
  rho = 0.0010083888
  rho2 = 0.0060503328
  CSW = 1.74
  MaxSolverIterations = 60000
  AcceptancePrecision =  1.e-21
  ForcePrecision = 1.e-18
  Name = cloverdetratio3light
  solver = ddalphaamg
EndMonomial

BeginMonomial CLOVERDETRATIO
  Timescale = 5
  kappa = 0.140065
  2KappaMu = 0.00033615600
  rho = 0.0
  rho2 = 0.0010083888
  CSW = 1.74
  MaxSolverIterations = 60000
  AcceptancePrecision =  1.e-21
  ForcePrecision = 1.e-18
  Name = cloverdetratio4light
  solver = ddalphaamg
EndMonomial

BeginMonomial NDCLOVERRAT
  Timescale = 2
  kappa = 0.140065
  CSW = 1.74
  AcceptancePrecision =  1e-21
  ForcePrecision = 1e-16
  StildeMin = 0.0000376
  StildeMax = 4.7
  Name = ndcloverrat1
  DegreeOfRational = 10
  Cmin = 0
  Cmax = 2
  ComputeEVFreq = 1
  2Kappamubar = 0.03944230400
  2Kappaepsbar = 0.04260777300
  AddTrLog = yes
  useexternalinverter = qphix
  usecompression = 12
  solver = cgmmsnd
EndMonomial

BeginMonomial NDCLOVERRAT
  Timescale = 3
  kappa = 0.140065
  CSW = 1.74
  AcceptancePrecision =  1e-21
  ForcePrecision = 1e-16
  StildeMin = 0.0000376
  StildeMax = 4.7
  Name = ndcloverrat2
  DegreeOfRational = 10
  Cmin = 3
  Cmax = 4
  ComputeEVFreq = 0
  2Kappamubar = 0.03944230400
  2Kappaepsbar = 0.04260777300
  AddTrLog = no
  useexternalinverter = qphix
  usecompression = 12
  solver = cgmmsnd
EndMonomial

BeginMonomial NDCLOVERRAT
  Timescale = 4
  kappa = 0.140065
  CSW = 1.74
  AcceptancePrecision =  1e-21
  ForcePrecision = 1e-16
  StildeMin = 0.0000376
  StildeMax = 4.7
  Name = ndcloverrat3
  DegreeOfRational = 10
  Cmin = 5
  Cmax = 6
  ComputeEVFreq = 0
  2Kappamubar = 0.03944230400
  2Kappaepsbar = 0.04260777300
  AddTrLog = no
  useexternalinverter = qphix
  usecompression = 12
  solver = cgmmsnd
EndMonomial

BeginMonomial NDCLOVERRAT
  Timescale = 5
  kappa = 0.140065
  CSW = 1.74
  AcceptancePrecision =  1e-21
  ForcePrecision = 1e-16
  StildeMin = 0.0000376
  StildeMax = 4.7
  Name = ndcloverrat4
  DegreeOfRational = 10
  Cmin = 7
  Cmax = 9
  ComputeEVFreq = 0
  2Kappamubar = 0.03944230400
  2Kappaepsbar = 0.04260777300
  AddTrLog = no
  useexternalinverter = qphix
  usecompression = 12
  solver = cgmmsnd
EndMonomial

BeginMonomial NDCLOVERRATCOR
  Timescale = 1
  kappa = 0.140065
  CSW = 1.74
  AcceptancePrecision =  1e-21
  ForcePrecision = 1e-16
  StildeMin = 0.0000376
  StildeMax = 4.7
  Name = ndcloverratcor
  DegreeOfRational = 10
  ComputeEVFreq = 0
  2Kappamubar = 0.03944230400
  2Kappaepsbar = 0.04260777300
  useexternalinverter = qphix
  usecompression = 12
  solver = cgmmsnd
EndMonomial

BeginIntegrator 
  Type0 = 2MN
  Type1 = 2MN
  Type2 = 2MN
  Type3 = 2MN
  Type4 = 2MN
  Type5 = 2MN
  IntegrationSteps0 = 1
  IntegrationSteps1 = 1
  IntegrationSteps2 = 1
  IntegrationSteps3 = 1
  IntegrationSteps4 = 1
  IntegrationSteps5 = 17
  tau = 1.0
  Lambda0 = 0.185
  Lambda1 = 0.190
  Lambda2 = 0.195
  Lambda3 = 0.20
  Lambda4 = 0.205
  Lambda5 = 0.21
  NumberOfTimescales = 6
  MonitorForces = yes
EndIntegrator

32c64 on 32 SKL nodes (2x24 cores per node), 8 MPI tasks per node, 6 threads per task, no clover term

L=32
T=64
NrXProcs = 2
NrYProcs = 4
NrZProcs = 4
OMPNumThreads = 6

ReproduceRandomNumbers = no
RanluxdLevel = 2

Measurements = 10
Startcondition = continue
InitialStoreCounter = readin
DisableIOChecks = no

2KappaMu = 0.001305718
2Kappamubar = 0.04896441
2Kappaepsbar = 0.064306592
kappa = 0.1632147
NSave = 2
ThetaT = 1.
UseEvenOdd = yes
UseRelativePrecision = yes

ReversibilityCheck = no
ReversibilityCheckIntervall = 10

DebugLevel = 2
EigenvaluePrecision = 1e-7

BeginExternalInverter QPHIX
  # physical cores per MPI task
  NCores = 6
  
  # block sizes (see qphix papers for details)
  By = 8
  Bz = 8
  MinCt = 1
  
  # thread geometry
  # ompnumthreads = NCores * Sy * Sz
  # hyperthreads should be specified here
  Sy = 1
  Sz = 1
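  # here: 6 * 1 * 1 = 6, matching OMPNumThreads = 6 above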
  
  # paddings in XY and XYZ blocks
  PadXY = 0
  PadXYZ = 0
EndExternalInverter

BeginOperator TMWILSON
  2KappaMu = 0.001305718
  kappa = 0.1632147
  UseEvenOdd = yes
  MaxSolverIterations = 10000
  SolverPrecision = 1e-18
  # QPhiX is faster than DDalphaAMG here because of the setup overhead!
  solver = mixedcg
  UseExternalInverter = qphix
  UseCompression = 12
  UseSloppyPrecision = single
EndOperator

BeginMeasurement CORRELATORS
  Frequency = 1
EndMeasurement

BeginMeasurement GRADIENTFLOW
  Frequency = 4
  StepSize = 0.02
EndMeasurement

BeginMonomial GAUGE
  Type = Iwasaki
  beta = 1.90
  Timescale = 0
EndMonomial

BeginMonomial DET
  Timescale = 1
  2KappaMu = 0.08
  kappa = 0.1632147
  AcceptancePrecision =  1.e-20
  ForcePrecision = 1.e-14
  Name = det
  solver = mixedcg
  UseExternalInverter = qphix
  UseCompression = 12
  UseSloppyPrecision = single
  MaxSolverIterations = 2000
EndMonomial

BeginMonomial DETRATIO
  Timescale = 2
  kappa = 0.1632147
  kappa2 = 0.1632147
  2KappaMu = 0.009
  2KappaMu2 = 0.08
  AcceptancePrecision =  1.e-20
  ForcePrecision = 1.e-14
  Name = detratio1
  solver = mixedcg
  UseExternalInverter = qphix
  UseCompression = 12
  UseSloppyPrecision = single
  MaxSolverIterations = 4000
EndMonomial

BeginMonomial DETRATIO
  Timescale = 3
  kappa = 0.1632147
  kappa2 = 0.1632147
  2KappaMu = 0.001305718
  2KappaMu2 = 0.009
  AcceptancePrecision =  1.e-22
  ForcePrecision = 1.e-16
  Name = detratio2
  MaxSolverIterations = 25000
  # at this mass, QPhiX is faster than DDalphaAMG here because of the setup update overhead!
  solver = mixedcg
  UseExternalInverter = qphix
  UseCompression = 12
  UseSloppyPrecision = single
EndMonomial

BeginMonomial NDRAT
  Timescale = 2
  StildeMin = 0.000014
  StildeMax = 2.8
  ComputeEVFreq = 0
  2Kappamubar = 0.04896441
  2Kappaepsbar = 0.064306592
  kappa = 0.1632147
  ForcePrecision = 1e-16
  AcceptancePrecision = 1e-20
  DegreeOfRational = 10
  Cmin = 0
  Cmax = 5
  Name = ndrat_0_5
  UseExternalInverter = qphix
  UseCompression = 12
  Solver = cgmmsnd
EndMonomial

BeginMonomial NDRAT
  Timescale = 3
  StildeMin = 0.000014
  StildeMax = 2.8
  ComputeEVFreq = 0
  2Kappamubar = 0.04896441
  2Kappaepsbar = 0.064306592
  kappa = 0.1632147
  ForcePrecision = 1e-16
  AcceptancePrecision = 1e-20
  DegreeOfRational = 10
  Cmin = 6
  Cmax = 9
  Name = ndrat_6_9
  UseExternalInverter = qphix
  UseCompression = 12
  Solver = cgmmsnd
EndMonomial

BeginMonomial NDRATCOR
  Timescale = 1
  StildeMin = 0.000014
  StildeMax = 2.8
  ComputeEVFreq = 10
  2Kappamubar = 0.04896441
  2Kappaepsbar = 0.064306592
  kappa = 0.1632147
  AcceptancePrecision = 1e-22
  DegreeOfRational = 10
  Name = ndratcor_0_9
  UseExternalInverter = qphix
  UseCompression = 12
  Solver = cgmmsnd
EndMonomial

BeginIntegrator 
  Type0 = 2MNFG
  Type1 = 2MNFG
  Type2 = 2MNFG
  Type3 = 2MN
  IntegrationSteps0 = 1
  IntegrationSteps1 = 1
  IntegrationSteps2 = 1
  IntegrationSteps3 = 9
  tau = 1.
  ## 2MNFG
  Lambda0 = 0.166666667
  Lambda1 = 0.166666667
  Lambda2 = 0.166666667
  ## 2MN
  Lambda3 = 0.193183326
  NumberOfTimescales = 4
  MonitorForces = yes
EndIntegrator

SLURM Jobscripts for perfect pinning using Intel MPI

On SKL / KNL, proper MPI task and thread pinning is mandatory for good performance. With SLURM, this can be achieved with a variation of the following job script:

#!/bin/bash -x
#SBATCH --time=01:30:00
#SBATCH --mem=82G
#SBATCH --nodes=32
#SBATCH --exclusive
# we need SLURM to hand complete control over task pinning to Intel MPI;
# the settings below seem to achieve what we want on an SKL machine
# with 2x24 cores per node and hyperthreading enabled
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --job-name=yyy
#SBATCH --mail-type=ALL
#SBATCH --mail-user=xxx

export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=disable
export HFI_NO_CPUAFFINITY=1

export I_MPI_HYDRA_ENV=all
export I_MPI_PIN=1

## at debug level 4, Intel MPI prints the task pinning layout,
## which can be used to confirm that the desired layout has
## been obtained
export I_MPI_DEBUG=4

## pin domain extends over hyperthreads (2*Ncores = 2*OMP_NUM_THREADS in this case)
export I_MPI_PIN_DOMAIN=12
export OMP_NUM_THREADS=6
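## with 8 MPI tasks per node, 8 domains of 12 hardware threads cover the full
## 96 hardware threads (2 x 24 cores with hyperthreading) of each node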

## the "verbose" option prints the thread pinning; it can and should be disabled
## in production, because DDalphaAMG and tmLQCD will use different numbers of
## threads, resulting in LOTS of output every time the thread count changes...
# export KMP_AFFINITY="balanced,granularity=fine,verbose"

export KMP_AFFINITY="balanced,granularity=fine"

EXE=tmLQCD/hmc_tm

RUNDIR=job_dir
ODIR=${RUNDIR}/outputs

if [ ! -d ${ODIR} ]; then
  mkdir -p ${ODIR}
fi

ifile=32n.8ppn.6tpt.hmc.QPhiX.DDalphaAMG.input

cp ${ifile} ${RUNDIR}
cd ${RUNDIR}

ofile=${ODIR}/out.x2_y4_z4_t6.${SLURM_JOB_NAME}.${SLURM_JOB_ID}.out
date > ${ofile}

## we do not use srun but mpirun provided by Intel MPI
mpirun -n $(( 32 * 8 )) -ppn 8 ${EXE} -f ${ifile} 2>&1 | tee -a ${ofile}

RVAL=${PIPESTATUS[0]}  # exit status of mpirun rather than of tee
date >> ${ofile}