Node based DFT #475

Merged
merged 18 commits on Feb 21, 2024
1 change: 0 additions & 1 deletion doc/users/betzy_example.job
@@ -3,6 +3,5 @@
#SBATCH --tasks-per-node=12

export UCX_LOG_LEVEL=ERROR
export OMP_NUM_THREADS=15

~/my_path/to/mrchem --launcher='mpirun --rank-by node --map-by socket --bind-to numa' h2o
12 changes: 6 additions & 6 deletions doc/users/running.rst
@@ -102,6 +102,8 @@ the code on 16 threads (all sharing the same physical memory space)::

$ OMP_NUM_THREADS=16 mrchem h2o

Note that the number of threads will be set by ``OMP_NUM_THREADS`` only
if the code is compiled without MPI support; see below.

Distributed memory MPI
++++++++++++++++++++++
@@ -131,10 +133,12 @@ as it will be literally prepended to the ``mrchem.x`` command when the
each `NUMA <https://en.wikipedia.org/wiki/Non-uniform_memory_access>`_
domain (usually one per socket) of your CPU, and MPI across NUMA domains and
ultimately machines. Ideally, the number of OpenMP threads should be
between 8-20. E.g. on hardware with two sockets of 16 cores each, use
OMP_NUM_THREADS=16 and scale the number of MPI processes by the size
between 8-20. E.g. on hardware with two sockets of 16 cores each, scale
the number of MPI processes by the size
of the molecule, typically one process per ~5 orbitals or so (and
definitely not *more* than one process per orbital).
The actual number of threads will be set automatically regardless of the
value of ``OMP_NUM_THREADS``.
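
As an illustration of the rule of thumb above (not part of this patch), a rough sizing helper could look like the following sketch; the one-process-per-~5-orbitals ratio and the one-process-per-orbital cap come from the paragraph above, while the function name and the 60-orbital example are made up:

    # Back-of-the-envelope sizing for a hybrid MPI/OpenMP MRChem run
    # (illustration of the documented rule of thumb, not MRChem code).
    def suggest_mpi_processes(n_orbitals, orbitals_per_process=5):
        n_procs = max(1, round(n_orbitals / orbitals_per_process))
        return min(n_procs, n_orbitals)  # never more than one process per orbital

    print(suggest_mpi_processes(60))  # a ~60-orbital molecule -> 12 MPI processes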


Job example (Betzy)
@@ -173,10 +177,6 @@ a very small molecule for such setup!).
assigned to any other core, which would result in much reduced performance). The 16
cores of the group may then be used by the threads initiated by that MPI process.

``--oversubscribe``
To tell MPI that it should accept that the number of MPI processes times
the number of threads is larger than the number of available cores.

**Advanced option**:
Alternatively one can get full control of task placement using the Slurm workload
manager by replacing ``mpirun`` with ``srun`` and setting explicit CPU masks as::
1 change: 1 addition & 0 deletions doc/users/schema_input.json
@@ -24,6 +24,7 @@
},
"mpi": { # Section for MPI specification
"bank_size": int, # Number of MPI ranks in memory bank
"omp_threads": int, # Number of omp threads
Contributor

I don't understand why the number of OpenMP threads has to appear in the input, to be honest.

Member Author

It is to keep a way to force the number of threads, as it is set automatically otherwise. For testing performance, for example, one may want to use fewer threads than the maximum.

Contributor

And the OMP_NUM_THREADS environment variable isn't enough?

Member Author

For the MPI case the OMP_NUM_THREADS variable is not used. This is because it is often not set automatically by the system, and even when it is set, it will not have the right value. If the user is asked to set it, they will most probably not choose the optimal value (the optimal value is larger than the number of cores divided by the number of MPI processes, because not all of the MPI processes are threaded).

Member Author

I understand your question: in an earlier version I used OMP_NUM_THREADS, but then I realized that the only case where this is useful is the rare one where you do not want to use all the cores. In the vast majority of practical situations, the risk of picking a non-optimal value was large.
Good that you made that remark, because I had forgotten to update the docs :)

"numerically_exact": bool, # Guarantee MPI invariant results
"shared_memory_size": int # Size (MB) of MPI shared memory blocks
},
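
The reasoning in the review thread above, namely that the optimal thread count exceeds cores divided by MPI processes because some processes (such as the memory bank) are not threaded, can be illustrated with a small sketch; the core and process counts below are made-up values, and this is not the actual MRChem logic:

    # Why the automatic choice can beat a naive OMP_NUM_THREADS (illustration only):
    # unthreaded processes (such as the memory bank) leave cores for the workers.
    def thread_counts(cores_per_node, procs_per_node, unthreaded_procs=1):
        naive = cores_per_node // procs_per_node             # cores / processes
        workers = max(1, procs_per_node - unthreaded_procs)
        automatic = cores_per_node // workers                # what threaded workers can use
        return naive, automatic

    print(thread_counts(cores_per_node=128, procs_per_node=8))  # -> (16, 18)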
7 changes: 6 additions & 1 deletion doc/users/user_inp.rst
@@ -115,6 +115,7 @@ This section defines some parameters that are used in MPI runs (defaults shown):

MPI {
bank_size = -1 # Number of processes used as memory bank
omp_threads = -1 # Number of omp threads to use
numerically_exact = false # Guarantee MPI invariant results
share_nuclear_potential = false # Use MPI shared memory window
share_coulomb_potential = false # Use MPI shared memory window
@@ -131,6 +132,11 @@ it is likely more efficient to set `bank_size = 0`, otherwise it's recommended
to use the default. If a particular calculation runs out of memory, it might
help to increase the number of bank processes from the default value.

The number of OpenMP threads can be forced using the ``omp_threads`` keyword.
For MPI runs it is strongly advised to keep the default, as the optimal value
can be difficult to guess. The environment variable ``OMP_NUM_THREADS`` is not
used for MPI runs.
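
A minimal sketch of the keyword semantics described above, assuming (as documented) that ``-1`` means the value is chosen automatically and that ``OMP_NUM_THREADS`` is only consulted for non-MPI builds; this is an illustration, not MRChem's implementation:

    import os

    # Sketch of the documented omp_threads semantics (not MRChem's implementation).
    def effective_threads(omp_threads, automatic_choice, with_mpi=True):
        if omp_threads > 0:
            return omp_threads        # the input keyword forces the thread count
        if with_mpi:
            return automatic_choice   # OMP_NUM_THREADS is ignored for MPI runs
        return int(os.environ.get("OMP_NUM_THREADS", automatic_choice))

    print(effective_threads(-1, automatic_choice=16))  # -> 16, the automatic value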

The ``numerically_exact`` keyword will trigger algorithms that guarantee that
the computed results are invariant (within double precision) with respect to
the number of MPI processes. These exact algorithms require more memory and are
@@ -680,4 +686,3 @@ avoid overwriting the files by default). So, in order to use MW orbitals from a
previous calculation, you must either change one of the paths
(``Response.path_orbitals`` or ``Files.guess_X_p`` etc), or manually copy the
files between the default locations.

10 changes: 8 additions & 2 deletions doc/users/user_ref.rst
@@ -231,7 +231,13 @@ User input reference

**Default** ``-1``

:Basis: Define polynomial basis.
:omp_threads: Force the number of OpenMP threads.

**Type** ``int``

**Default** ``-1``

:Basis: Define polynomial basis.

:red:`Keywords`
:order: Polynomial order of multiwavelet basis. Negative value means it will be set automatically based on the world precision.
@@ -12203,4 +12209,4 @@ User input reference
**Type** ``float``

**Default** ``0.00011186082063``


2 changes: 1 addition & 1 deletion external/upstream/fetch_mrcpp.cmake
@@ -39,7 +39,7 @@ else()
GIT_REPOSITORY
https://github.com/MRChemSoft/mrcpp.git
GIT_TAG
f8def0a086da6410e5dd8e078de4f6b6305b6ea3
720133372c9717134c5a01e963cb9804a1e8c36e
)

FetchContent_GetProperties(mrcpp_sources)
1 change: 1 addition & 0 deletions python/mrchem/api.py
@@ -68,6 +68,7 @@ def write_mpi(user_dict):
"numerically_exact": user_dict["MPI"]["numerically_exact"],
"shared_memory_size": user_dict["MPI"]["shared_memory_size"],
"bank_size": user_dict["MPI"]["bank_size"],
"omp_threads": user_dict["MPI"]["omp_threads"],
}
return mpi_dict

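A quick sanity check of the mapping added in this hunk could look as follows; this assumes the mrchem Python package is importable and that write_mpi reads only the four MPI keys visible above, and the values are illustrative:

    from mrchem.api import write_mpi  # assumes the package layout shown in this PR

    # Illustrative user dictionary; the keys mirror those read in the hunk above.
    user_dict = {
        "MPI": {
            "numerically_exact": False,
            "shared_memory_size": 10000,
            "bank_size": -1,
            "omp_threads": -1,  # -1 -> let MRChem pick the thread count
        }
    }

    mpi_dict = write_mpi(user_dict)
    print(mpi_dict["omp_threads"])  # -> -1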
3 changes: 3 additions & 0 deletions python/mrchem/input_parser/api.py
@@ -227,6 +227,9 @@ def stencil() -> JSONDict:
'type': 'bool'},
{ 'default': -1,
'name': 'bank_size',
'type': 'int'},
{ 'default': -1,
'name': 'omp_threads',
'type': 'int'}],
'name': 'MPI'},
{ 'keywords': [ { 'default': -1,
5 changes: 5 additions & 0 deletions python/template.yml
@@ -192,6 +192,11 @@ sections:
default: -1
docstring: |
Number of MPI processes exclusively dedicated to manage orbital bank.
- name: omp_threads
type: int
default: -1
docstring: |
Force the number of OpenMP threads (leaving the default is recommended).
- name: Basis
docstring: |
Define polynomial basis.
32 changes: 1 addition & 31 deletions src/initial_guess/core.cpp
@@ -227,37 +227,7 @@ void initial_guess::core::project_ao(OrbitalVector &Phi, double prec, const Nucl
void initial_guess::core::rotate_orbitals(OrbitalVector &Psi, double prec, ComplexMatrix &U, OrbitalVector &Phi) {
if (Psi.size() == 0) return;
Timer t_tot;

// To get MPI invariant results we cannot crop until all terms are added
auto part_prec = (mrcpp::mpi::numerically_exact) ? -1.0 : prec;

OrbitalIterator iter(Phi);
while (true) {
// for some unknown reason, iter.next() does not work here for the parallel version
if (mrcpp::mpi::wrk_size == 1) {
if (iter.next() < 1) break;
} else {
if (iter.bank_next() < 1) break;
}
for (auto j = 0; j < Psi.size(); j++) {
if (not mrcpp::mpi::my_orb(j)) continue;
std::vector<mrcpp::ComplexFunction> func_vec;
ComplexVector coef_vec(iter.get_size());
for (auto i = 0; i < iter.get_size(); i++) {
auto idx_i = iter.idx(i);
auto &recv_i = iter.orbital(i);
coef_vec[i] = U(idx_i, j);
func_vec.push_back(recv_i);
}
auto tmp_j = Psi[j].paramCopy();
mrcpp::cplxfunc::linear_combination(tmp_j, coef_vec, func_vec, part_prec);
Psi[j].add(1.0, tmp_j); // In place addition
Psi[j].crop(part_prec);
}
}
if (mrcpp::mpi::numerically_exact)
for (auto &psi : Psi) psi.crop(prec);

mrcpp::mpifuncvec::rotate(Phi, U, Psi, prec);
mrcpp::print::time(1, "Rotating orbitals", t_tot);
}

2 changes: 2 additions & 0 deletions src/mrdft/Factory.cpp
@@ -77,6 +77,8 @@ std::unique_ptr<MRDFT> Factory::build() {
if (lda) func_p = std::make_unique<LDA>(order, xcfun_p);
}
if (func_p == nullptr) MSG_ABORT("Invalid functional type");
diff_p = std::make_unique<mrcpp::ABGVOperator<3>>(mra, 0.0, 0.0);
func_p->setDerivOp(diff_p);
func_p->setLogGradient(log_grad);
func_p->setDensityCutoff(cutoff);
