From 243703d4e691f1e63ac6402e942ea08b7d708492 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 8 Nov 2023 09:04:39 +0100
Subject: [PATCH 01/12] Initial addition of a benchmark to measure the diff
 performance for a NN.

---
 benchmark/CMakeLists.txt |  1 +
 benchmark/nn_diff.cpp    | 59 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 benchmark/nn_diff.cpp
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 5725f244f..0999797cf 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -43,6 +43,7 @@ ADD_HEYOKA_BENCHMARK(taylor_ANN)
 ADD_HEYOKA_BENCHMARK(taylor_ANN_2)
 ADD_HEYOKA_BENCHMARK(apophis)
 ADD_HEYOKA_BENCHMARK(stiff_equation)
+ADD_HEYOKA_BENCHMARK(nn_diff)
 ADD_HEYOKA_BENCHMARK(mascon_models)
 ADD_HEYOKA_BENCHMARK(outer_ss_jet_benchmark)
 ADD_HEYOKA_BENCHMARK(outer_ss_long_term)
diff --git a/benchmark/nn_diff.cpp b/benchmark/nn_diff.cpp
new file mode 100644
index 000000000..5494ce1d8
--- /dev/null
+++ b/benchmark/nn_diff.cpp
@@ -0,0 +1,59 @@
+// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <stdexcept>
+
+#include <boost/program_options.hpp>
+
+#include <spdlog/spdlog.h>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/kw.hpp>
+#include <heyoka/logging.hpp>
+#include <heyoka/math/tanh.hpp>
+#include <heyoka/model/ffnn.hpp>
+
+using namespace heyoka;
+
+int main(int argc, char *argv[])
+{
+    namespace po = boost::program_options;
+
+    create_logger();
+    set_logger_level_trace();
+
+    auto logger = spdlog::get("heyoka");
+
+    unsigned nn_layer{};
+
+    po::options_description desc("Options");
+
+    desc.add_options()("help", "produce help message")("nn_layer", po::value<unsigned>(&nn_layer)->default_value(10u),
+                                                       "number of neurons per layer");
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help") != 0u) {
+        std::cout << desc << "\n";
+        return 0;
+    }
+
+    if (nn_layer == 0u) {
+        throw std::invalid_argument("The number of neurons per layer cannot be zero");
+    }
+
+    auto [x, y] = make_vars("x", "y");
+    auto ffnn = model::ffnn(kw::inputs = {x, y}, kw::nn_hidden = {nn_layer, nn_layer, nn_layer}, kw::n_out = 2,
+                            kw::activations = {heyoka::tanh, heyoka::tanh, heyoka::tanh, heyoka::tanh});
+
+    auto dt = diff_tensors(ffnn, kw::diff_args = diff_args::params);
+    auto dNdtheta = dt.get_jacobian();
+}

From 814ff7596f0ec4b0d206c4bb49d7eb5654c0d3a9 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 8 Nov 2023 09:08:41 +0100
Subject: [PATCH 02/12] Change the logic of diff_tensors() not to use a
 std::map internally.

Rather, the derivatives for each order are added in unordered fashion
and sorted only at the end of an iteration.
---
 src/expression_diff.cpp | 117 +++++++++++++++++++++++++++-------------
 1 file changed, 79 insertions(+), 38 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index e96f519b6..9fa6fc168 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -13,7 +13,6 @@
 #include <deque>
 #include <functional>
 #include <iterator>
-#include <map>
 #include <memory>
 #include <ostream>
 #include <stdexcept>
@@ -27,6 +26,7 @@
 #include <vector>
 
 #include <boost/container/container_fwd.hpp>
+#include <boost/container_hash/hash.hpp>
 #include <boost/numeric/conversion/cast.hpp>
 #include <boost/safe_numerics/safe_integer.hpp>
 
@@ -34,6 +34,7 @@
 #include <fmt/ranges.h>
 
 #include <heyoka/config.hpp>
+#include <heyoka/detail/fast_unordered.hpp>
 #include <heyoka/detail/func_cache.hpp>
 #include <heyoka/detail/logging_impl.hpp>
 #include <heyoka/detail/string_conv.hpp>
@@ -450,11 +451,26 @@ auto diff_make_adj_dep(const std::vector<expression> &dc, std::vector<expression
     return std::tuple{std::move(adj), std::move(dep), std::move(revdep), std::move(subs_map)};
 }
 
+// Hasher for the local maps of derivatives used in the
+// forward/reverse mode implementations.
+struct diff_map_hasher {
+    std::size_t operator()(const dtens_v_idx_t &v) const noexcept
+    {
+        std::size_t seed = std::hash<decltype(v.size())>{}(v.size());
+
+        for (auto idx : v) {
+            boost::hash_combine(seed, std::hash<decltype(idx)>{}(idx));
+        }
+
+        return seed;
+    }
+};
+
 // Forward-mode implementation of diff_tensors().
 template <typename DiffMap, typename Dep, typename Adj>
 void diff_tensors_forward_impl(
-    // The map of derivatives. It wil be updated as new derivatives
-    // are computed.
+    // The map of derivatives. It will be updated after all the
+    // derivatives have been computed.
     DiffMap &diff_map,
     // The number of derivatives in the previous-order tensor.
     std::vector<expression>::size_type cur_nouts,
@@ -478,6 +494,10 @@ void diff_tensors_forward_impl(
 {
     assert(dc.size() > nvars);
 
+    // Local map used to temporarily store the derivatives.
+    // It will be merged into diff_map at the end.
+    fast_umap<dtens_v_idx_t, expression, diff_map_hasher> local_diff_map;
+
     // An indices vector used as temporary variable in several places below.
     std::vector<std::uint32_t> tmp_v_idx;
 
@@ -523,7 +543,7 @@ void diff_tensors_forward_impl(
 
                 // Use try_emplace() so that if the derivative
                 // has already been computed, nothing happens.
-                diff_map.try_emplace(tmp_v_idx, 0_dbl);
+                local_diff_map.try_emplace(tmp_v_idx, 0_dbl);
             }
 
             // Move to the next diff argument.
@@ -625,7 +645,7 @@ void diff_tensors_forward_impl(
         }
 
         // Add the derivatives of all outputs wrt the current input
-        // to diff_map.
+        // to local_diff_map.
         auto out_it = prev_begin;
 
         for (std::vector<expression>::size_type out_idx = 0; out_idx < cur_nouts; ++out_idx, ++out_it) {
@@ -636,22 +656,25 @@ void diff_tensors_forward_impl(
             tmp_v_idx[diff_arg_idx + 1u] += 1u;
 
             // Check if we already computed this derivative.
-            if (const auto it = diff_map.find(tmp_v_idx); it == diff_map.end()) {
+            if (const auto it = local_diff_map.find(tmp_v_idx); it == local_diff_map.end()) {
                 // The derivative is new.
                 auto cur_der = diffs[diffs.size() - cur_nouts + out_idx];
 
-                [[maybe_unused]] const auto [_, flag] = diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
+                [[maybe_unused]] const auto [_, flag] = local_diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
                 assert(flag);
             }
         }
     }
+
+    // Merge local_diff_map into diff_map.
+    diff_map.insert(diff_map.end(), local_diff_map.begin(), local_diff_map.end());
 }
 
 // Reverse-mode implementation of diff_tensors().
 template <typename DiffMap, typename Dep, typename Adj>
 void diff_tensors_reverse_impl(
-    // The map of derivatives. It wil be updated as new derivatives
-    // are computed.
+    // The map of derivatives. It will be updated after all the
+    // derivatives have been computed.
     DiffMap &diff_map,
     // The number of derivatives in the previous-order tensor.
     std::vector<expression>::size_type cur_nouts,
@@ -675,6 +698,10 @@ void diff_tensors_reverse_impl(
 {
     assert(dc.size() > nvars);
 
+    // Local map used to temporarily store the derivatives.
+    // It will be merged into diff_map at the end.
+    fast_umap<dtens_v_idx_t, expression, diff_map_hasher> local_diff_map;
+
     // Cache the number of diff arguments.
     const auto nargs = args.size();
 
@@ -813,7 +840,7 @@ void diff_tensors_reverse_impl(
             assert(flag);
         }
 
-        // Add the derivatives to diff_map.
+        // Add the derivatives to local_diff_map.
         for (decltype(args.size()) j = 0; j < nargs; ++j) {
             // Compute the indices vector for the current derivative.
             tmp_v_idx = prev_begin->first;
@@ -824,7 +851,7 @@ void diff_tensors_reverse_impl(
             tmp_v_idx[j + 1u] += 1u;
 
             // Check if we already computed this derivative.
-            if (const auto it = diff_map.find(tmp_v_idx); it == diff_map.end()) {
+            if (const auto it = local_diff_map.find(tmp_v_idx); it == local_diff_map.end()) {
                 // The derivative is new. If the diff argument is present in the
                 // decomposition, then we will calculate the derivative and add it.
                 // Otherwise, we set the derivative to zero and add it.
@@ -834,24 +861,18 @@ void diff_tensors_reverse_impl(
                     cur_der = it_dmap->second;
                 }
 
-                [[maybe_unused]] const auto [_, flag] = diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
+                [[maybe_unused]] const auto [_, flag] = local_diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
                 assert(flag);
             }
         }
 
         // Update prev_begin as we move to the next output.
-        // NOTE; the iterator here stays valid even if we keep on modifying
-        // diff_map thanks to the iterator invalidation rules for associative
-        // containers (i.e., we never call erase, we just keep on inserting new
-        // elements).
-        // NOTE: prev_begin is iterating over the previous-order derivatives
-        // without interference from the new derivatives we are adding, since
-        // they are all higher-order derivatives (and thus appended past the end
-        // of the previous order derivative range).
         ++prev_begin;
-
-        assert(prev_begin != diff_map.end());
+        assert(prev_begin != diff_map.end() || i + 1u == cur_nouts);
     }
+
+    // Merge local_diff_map into diff_map.
+    diff_map.insert(diff_map.end(), local_diff_map.begin(), local_diff_map.end());
 }
 
 } // namespace
@@ -1003,9 +1024,23 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
     // each diff args. E.g., with diff args = [x, y, z],
     // then [0, 1, 2, 1] means d4f0/(dx dy**2 dz) (where f0 is the
     // first component of the vector function f).
-    // Using a std::map is handy for iterative construction, it will
-    // be turned into an equivalent flat_map at the end.
-    std::map<dtens_v_idx_t, expression, dtens_v_idx_cmp> diff_map;
+    // This will be kept manually sorted according to dtens_v_idx_cmp
+    // and it will be turned into a flat map at the end.
+    dtens_map_t::sequence_type diff_map;
+
+    // Helper to locate the vector of indices v in diff_map. If not present,
+    // diff_map.end() will be returned.
+    auto search_diff_map = [&diff_map](const dtens_v_idx_t &v) {
+        auto it = std::lower_bound(diff_map.begin(), diff_map.end(), v, [](const auto &item, const auto &vec) {
+            return dtens_v_idx_cmp{}(item.first, vec);
+        });
+
+        if (it != diff_map.end() && it->first == v) {
+            return it;
+        } else {
+            return diff_map.end();
+        }
+    };
 
     // An indices vector with preallocated storage,
     // used as temporary variable in several places below.
@@ -1021,8 +1056,8 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
     for (decltype(v_ex.size()) i = 0; i < orig_nouts; ++i) {
         tmp_v_idx[0] = boost::numeric_cast<std::uint32_t>(i);
 
-        assert(diff_map.count(tmp_v_idx) == 0u);
-        diff_map[tmp_v_idx] = v_ex[i];
+        assert(search_diff_map(tmp_v_idx) == diff_map.end());
+        diff_map.emplace_back(tmp_v_idx, v_ex[i]);
     }
 
     // Iterate over the derivative orders.
@@ -1032,7 +1067,7 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         tmp_v_idx[0] = 0;
         tmp_v_idx[1] = cur_order;
         std::fill(tmp_v_idx.begin() + 2, tmp_v_idx.end(), static_cast<std::uint32_t>(0));
-        const auto prev_begin = diff_map.find(tmp_v_idx);
+        const auto prev_begin = search_diff_map(tmp_v_idx);
         assert(prev_begin != diff_map.end());
 
         // Store the previous-order derivatives into a separate
@@ -1051,6 +1086,10 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         // Create the adjoints, the direct/reverse dependencies and the substitution map.
         const auto [adj, dep, revdep, subs_map] = diff_make_adj_dep(dc, nvars, cur_nouts);
 
+        // Store the current diff_map size in order to (later) determine
+        // where the set of derivatives for the current order begins.
+        const auto orig_diff_map_size = diff_map.size();
+
         spdlog::stopwatch sw_inner;
 
         // NOTE: in order to choose between forward and reverse mode, we adopt the standard approach
@@ -1063,22 +1102,23 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
             diff_tensors_reverse_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin);
         }
 
+        // Determine the range in diff_map for the current-order derivatives.
+        auto *cur_begin = diff_map.data() + orig_diff_map_size;
+        auto *cur_end = diff_map.data() + diff_map.size();
+
+        // Sort the derivatives for the current order.
+        std::sort(cur_begin, cur_end,
+                  [](const auto &p1, const auto &p2) { return dtens_v_idx_cmp{}(p1.first, p2.first); });
+
         // NOTE: the derivatives we just added to diff_map are still expressed in terms of u variables.
         // We need to apply the substitution map subs_map in order to recover the expressions in terms
         // of the original variables. It is important that we do this now (rather then when constructing
         // the derivatives in diff_tensors_*_impl()) because now we can do the substitution in a vectorised
         // fashion, which greatly reduces the internal redundancy of the resulting expressions.
 
-        // Locate in diff_map the iterator to where we began adding derivatives for the current order.
-        tmp_v_idx[0] = 0;
-        tmp_v_idx[1] = cur_order + 1u;
-        std::fill(tmp_v_idx.begin() + 2, tmp_v_idx.end(), static_cast<std::uint32_t>(0));
-        const auto new_begin = diff_map.find(tmp_v_idx);
-        assert(new_begin != diff_map.end());
-
         // Create the vector of expressions for the substitution.
         std::vector<expression> subs_ret;
-        for (auto it = new_begin; it != diff_map.end(); ++it) {
+        for (auto *it = cur_begin; it != cur_end; ++it) {
             subs_ret.push_back(it->second);
         }
 
@@ -1087,7 +1127,7 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
 
         // Replace the original expressions in diff_map.
         decltype(subs_ret.size()) i = 0;
-        for (auto it = new_begin; i < subs_ret.size(); ++i, ++it) {
+        for (auto *it = cur_begin; i < subs_ret.size(); ++i, ++it) {
             it->second = subs_ret[i];
         }
 
@@ -1111,7 +1151,8 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
     // the expressions returned by models, for consistency.
 
     // Assemble and return the result.
-    dtens_map_t retval(boost::container::ordered_unique_range_t{}, diff_map.begin(), diff_map.end());
+    dtens_map_t retval;
+    retval.adopt_sequence(boost::container::ordered_unique_range_t{}, std::move(diff_map));
 
     // Check sorting.
     assert(std::is_sorted(retval.begin(), retval.end(),

From ceadde0ccbd19cdef4f6255116442aca0658bfd2 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 8 Nov 2023 09:17:01 +0100
Subject: [PATCH 03/12] Replace several usages of std::unordered with the fast
 counterparts from Boost in the diff_tensors() code.

---
 src/expression_diff.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 9fa6fc168..4477f86b3 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -336,7 +336,7 @@ auto diff_make_adj_dep(const std::vector<expression> &dc, std::vector<expression
     // Do an initial pass to create the adjoints, the
     // vectors of direct and reverse dependencies,
     // and the substitution map.
-    std::vector<std::unordered_map<std::uint32_t, expression>> adj;
+    std::vector<fast_umap<std::uint32_t, expression>> adj;
     adj.resize(boost::numeric_cast<decltype(adj.size())>(dc.size()));
 
     std::vector<std::vector<std::uint32_t>> dep;
@@ -506,7 +506,7 @@ void diff_tensors_forward_impl(
     // to avoid iterating over those subexpressions which do not depend on
     // an input. We need two containers (with identical content)
     // because we need both ordered iteration AND fast lookup.
-    std::unordered_set<std::uint32_t> in_deps;
+    fast_uset<std::uint32_t> in_deps;
     std::vector<std::uint32_t> sorted_in_deps;
 
     // A stack to be used when filling up in_deps/sorted_in_deps.
@@ -515,7 +515,7 @@ void diff_tensors_forward_impl(
     // Create a dictionary mapping an input to its position
     // in the decomposition. This is used to locate diff arguments
     // in the decomposition.
-    std::unordered_map<expression, std::vector<expression>::size_type> input_idx_map;
+    fast_umap<expression, std::vector<expression>::size_type, std::hash<expression>> input_idx_map;
     for (std::vector<expression>::size_type i = 0; i < nvars; ++i) {
         const auto &cur_in = dc[i];
         assert(input_idx_map.count(cur_in) == 0u);
@@ -714,7 +714,7 @@ void diff_tensors_reverse_impl(
     // does not depend (recall that the decomposition contains the subexpressions
     // for ALL outputs). We need two containers (with identical content)
     // because we need both ordered iteration AND fast lookup.
-    std::unordered_set<std::uint32_t> out_deps;
+    fast_uset<std::uint32_t> out_deps;
     std::vector<std::uint32_t> sorted_out_deps;
 
     // A stack to be used when filling up out_deps/sorted_out_deps.
@@ -834,7 +834,7 @@ void diff_tensors_reverse_impl(
         // to fetch from diffs only the derivatives we are interested in
         // (since there may be vars/params in the decomposition wrt which
         // the derivatives are not requested).
-        std::unordered_map<expression, expression> dmap;
+        fast_umap<expression, expression, std::hash<expression>> dmap;
         for (std::vector<expression>::size_type j = 0; j < nvars; ++j) {
             [[maybe_unused]] const auto [_, flag] = dmap.try_emplace(dc[j], diffs[j]);
             assert(flag);
@@ -1225,7 +1225,7 @@ dtens diff_tensors(const std::vector<expression> &v_ex, const std::variant<diff_
     }
 
     // Check if there are repeated entries in args.
-    const std::unordered_set args_set(args.begin(), args.end());
+    const fast_uset<expression, std::hash<expression>> args_set(args.begin(), args.end());
     if (args_set.size() != args.size()) {
         throw std::invalid_argument(
             fmt::format("Duplicate entries detected in the list of variables/parameters with respect to which the "

From 4978b36cdc1e79df5e4afc6f581857fcdc1aa476 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 8 Nov 2023 09:26:00 +0100
Subject: [PATCH 04/12] Parallelise diff sorting.

---
 src/expression_diff.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 4477f86b3..448213006 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -33,6 +33,8 @@
 #include <fmt/core.h>
 #include <fmt/ranges.h>
 
+#include <oneapi/tbb/parallel_sort.h>
+
 #include <heyoka/config.hpp>
 #include <heyoka/detail/fast_unordered.hpp>
 #include <heyoka/detail/func_cache.hpp>
@@ -1107,8 +1109,8 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         auto *cur_end = diff_map.data() + diff_map.size();
 
         // Sort the derivatives for the current order.
-        std::sort(cur_begin, cur_end,
-                  [](const auto &p1, const auto &p2) { return dtens_v_idx_cmp{}(p1.first, p2.first); });
+        tbb::parallel_sort(cur_begin, cur_end,
+                           [](const auto &p1, const auto &p2) { return dtens_v_idx_cmp{}(p1.first, p2.first); });
 
         // NOTE: the derivatives we just added to diff_map are still expressed in terms of u variables.
         // We need to apply the substitution map subs_map in order to recover the expressions in terms

From b3bdb324c024579ce503347da6bd2453fd07dc61 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 8 Nov 2023 09:28:50 +0100
Subject: [PATCH 05/12] Internal rename.

---
 src/expression_diff.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 448213006..89c504f9f 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -1109,8 +1109,8 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         auto *cur_end = diff_map.data() + diff_map.size();
 
         // Sort the derivatives for the current order.
-        tbb::parallel_sort(cur_begin, cur_end,
-                           [](const auto &p1, const auto &p2) { return dtens_v_idx_cmp{}(p1.first, p2.first); });
+        oneapi::tbb::parallel_sort(
+            cur_begin, cur_end, [](const auto &p1, const auto &p2) { return dtens_v_idx_cmp{}(p1.first, p2.first); });
 
         // NOTE: the derivatives we just added to diff_map are still expressed in terms of u variables.
         // We need to apply the substitution map subs_map in order to recover the expressions in terms

From e1614ff9c507f31916b9b79aa3c4261d4857b8e2 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 8 Nov 2023 09:48:16 +0100
Subject: [PATCH 06/12] Typos.

---
 doc/changelog.rst       | 2 +-
 src/expression_diff.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 695db4cb3..952d6d02c 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -150,7 +150,7 @@ Changes
 ~~~~~~~
 
 - The step callbacks are now copied in :ref:`ensemble propagations <tut_ensemble>`
-  rather then being shared among threads. The aim of this change
+  rather than being shared among threads. The aim of this change
   is to reduce the likelihood of data races
   (`#334 <https://github.com/bluescarni/heyoka/pull/334>`__).
 - Comprehensive overhaul of the expression system, including:
diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 89c504f9f..c1ac51527 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -1114,7 +1114,7 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
 
         // NOTE: the derivatives we just added to diff_map are still expressed in terms of u variables.
         // We need to apply the substitution map subs_map in order to recover the expressions in terms
-        // of the original variables. It is important that we do this now (rather then when constructing
+        // of the original variables. It is important that we do this now (rather than when constructing
         // the derivatives in diff_tensors_*_impl()) because now we can do the substitution in a vectorised
         // fashion, which greatly reduces the internal redundancy of the resulting expressions.
 

From b7fc1d0bf99377bf95c5412f0e1f764d5e5d9bc9 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 8 Nov 2023 12:09:15 +0100
Subject: [PATCH 07/12] Avoid storing derivatives in maps when we are at the
 first order.

---
 src/expression_diff.cpp | 131 ++++++++++++++++++++++++++++++----------
 1 file changed, 99 insertions(+), 32 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index c1ac51527..3334595f7 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -492,13 +492,30 @@ void diff_tensors_forward_impl(
     const std::vector<expression> &args,
     // Iterator in diff_map pointing to the first
     // derivative for the previous order.
-    typename DiffMap::iterator prev_begin)
+    typename DiffMap::iterator prev_begin,
+    // The current derivative order.
+    std::uint32_t cur_order)
 {
     assert(dc.size() > nvars);
-
-    // Local map used to temporarily store the derivatives.
-    // It will be merged into diff_map at the end.
-    fast_umap<dtens_v_idx_t, expression, diff_map_hasher> local_diff_map;
+    assert(cur_order > 0u);
+
+    // Local data structures used to temporarily store the derivatives,
+    // which will eventually be added to diff_map.
+    // For derivative orders > 1, the algorithm we employ
+    // will produce several times the same derivative, and thus
+    // we need to store the derivatives in a dictionary in order
+    // to prevent duplicates. For order-1 derivatives, no duplicate
+    // derivatives will be produced and thus we can use a plain vector
+    // which can be quite a bit faster.
+    using diff_map_t = fast_umap<dtens_v_idx_t, expression, diff_map_hasher>;
+    using diff_vec_t = std::vector<std::pair<dtens_v_idx_t, expression>>;
+    using local_diff_t = std::variant<diff_map_t, diff_vec_t>;
+    auto local_diff = (cur_order == 1u) ? local_diff_t(diff_vec_t{}) : local_diff_t(diff_map_t{});
+
+    // Helpers to ease the access to the active member of the local_diff variant.
+    // NOTE: if used incorrectly, these will throw at runtime.
+    auto local_dmap = [&local_diff]() -> diff_map_t & { return std::get<diff_map_t>(local_diff); };
+    auto local_dvec = [&local_diff]() -> diff_vec_t & { return std::get<diff_vec_t>(local_diff); };
 
     // An indices vector used as temporary variable in several places below.
     std::vector<std::uint32_t> tmp_v_idx;
@@ -530,6 +547,7 @@ void diff_tensors_forward_impl(
     for (decltype(args.size()) diff_arg_idx = 0; diff_arg_idx < args.size(); ++diff_arg_idx) {
         const auto &cur_diff_arg = args[diff_arg_idx];
 
+        // Check if the current diff argument is one of the inputs.
         if (input_idx_map.count(cur_diff_arg) == 0u) {
             // The diff argument is not one of the inputs:
             // set the derivatives of all outputs wrt to the
@@ -543,9 +561,13 @@ void diff_tensors_forward_impl(
                 assert(diff_arg_idx + 1u < tmp_v_idx.size());
                 tmp_v_idx[diff_arg_idx + 1u] += 1u;
 
-                // Use try_emplace() so that if the derivative
-                // has already been computed, nothing happens.
-                local_diff_map.try_emplace(tmp_v_idx, 0_dbl);
+                if (cur_order == 1u) {
+                    local_dvec().emplace_back(tmp_v_idx, 0_dbl);
+                } else {
+                    // NOTE: use try_emplace() so that if the derivative
+                    // has already been computed, nothing happens.
+                    local_dmap().try_emplace(tmp_v_idx, 0_dbl);
+                }
             }
 
             // Move to the next diff argument.
@@ -647,7 +669,7 @@ void diff_tensors_forward_impl(
         }
 
         // Add the derivatives of all outputs wrt the current input
-        // to local_diff_map.
+        // to the local map.
         auto out_it = prev_begin;
 
         for (std::vector<expression>::size_type out_idx = 0; out_idx < cur_nouts; ++out_idx, ++out_it) {
@@ -657,19 +679,28 @@ void diff_tensors_forward_impl(
             assert(diff_arg_idx + 1u < tmp_v_idx.size());
             tmp_v_idx[diff_arg_idx + 1u] += 1u;
 
-            // Check if we already computed this derivative.
-            if (const auto it = local_diff_map.find(tmp_v_idx); it == local_diff_map.end()) {
-                // The derivative is new.
+            if (cur_order == 1u) {
                 auto cur_der = diffs[diffs.size() - cur_nouts + out_idx];
+                local_dvec().emplace_back(tmp_v_idx, std::move(cur_der));
+            } else {
+                // Check if we already computed this derivative.
+                if (const auto it = local_dmap().find(tmp_v_idx); it == local_dmap().end()) {
+                    // The derivative is new.
+                    auto cur_der = diffs[diffs.size() - cur_nouts + out_idx];
 
-                [[maybe_unused]] const auto [_, flag] = local_diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
-                assert(flag);
+                    [[maybe_unused]] const auto [_, flag] = local_dmap().try_emplace(tmp_v_idx, std::move(cur_der));
+                    assert(flag);
+                }
             }
         }
     }
 
-    // Merge local_diff_map into diff_map.
-    diff_map.insert(diff_map.end(), local_diff_map.begin(), local_diff_map.end());
+    // Merge the local map into diff_map.
+    if (cur_order == 1u) {
+        diff_map.insert(diff_map.end(), local_dvec().begin(), local_dvec().end());
+    } else {
+        diff_map.insert(diff_map.end(), local_dmap().begin(), local_dmap().end());
+    }
 }
 
 // Reverse-mode implementation of diff_tensors().
@@ -696,13 +727,30 @@ void diff_tensors_reverse_impl(
     const std::vector<expression> &args,
     // Iterator in diff_map pointing to the first
     // derivative for the previous order.
-    typename DiffMap::iterator prev_begin)
+    typename DiffMap::iterator prev_begin,
+    // The current derivative order.
+    std::uint32_t cur_order)
 {
     assert(dc.size() > nvars);
-
-    // Local map used to temporarily store the derivatives.
-    // It will be merged into diff_map at the end.
-    fast_umap<dtens_v_idx_t, expression, diff_map_hasher> local_diff_map;
+    assert(cur_order > 0u);
+
+    // Local data structures used to temporarily store the derivatives,
+    // which will eventually be added to diff_map.
+    // For derivative orders > 1, the algorithm we employ
+    // will produce several times the same derivative, and thus
+    // we need to store the derivatives in a dictionary in order
+    // to prevent duplicates. For order-1 derivatives, no duplicate
+    // derivatives will be produced and thus we can use a plain vector
+    // which can be quite a bit faster.
+    using diff_map_t = fast_umap<dtens_v_idx_t, expression, diff_map_hasher>;
+    using diff_vec_t = std::vector<std::pair<dtens_v_idx_t, expression>>;
+    using local_diff_t = std::variant<diff_map_t, diff_vec_t>;
+    auto local_diff = (cur_order == 1u) ? local_diff_t(diff_vec_t{}) : local_diff_t(diff_map_t{});
+
+    // Helpers to ease the access to the active member of the local_diff variant.
+    // NOTE: if used incorrectly, these will throw at runtime.
+    auto local_dmap = [&local_diff]() -> diff_map_t & { return std::get<diff_map_t>(local_diff); };
+    auto local_dvec = [&local_diff]() -> diff_vec_t & { return std::get<diff_vec_t>(local_diff); };
 
     // Cache the number of diff arguments.
     const auto nargs = args.size();
@@ -842,7 +890,7 @@ void diff_tensors_reverse_impl(
             assert(flag);
         }
 
-        // Add the derivatives to local_diff_map.
+        // Add the derivatives to the local map.
         for (decltype(args.size()) j = 0; j < nargs; ++j) {
             // Compute the indices vector for the current derivative.
             tmp_v_idx = prev_begin->first;
@@ -852,10 +900,9 @@ void diff_tensors_reverse_impl(
             // is representable by std::uint32_t.
             tmp_v_idx[j + 1u] += 1u;
 
-            // Check if we already computed this derivative.
-            if (const auto it = local_diff_map.find(tmp_v_idx); it == local_diff_map.end()) {
-                // The derivative is new. If the diff argument is present in the
-                // decomposition, then we will calculate the derivative and add it.
+            if (cur_order == 1u) {
+                // Check if the diff argument is present in the
+                // decomposition: if it is, we will calculate the derivative and add it.
                 // Otherwise, we set the derivative to zero and add it.
                 expression cur_der = 0_dbl;
 
@@ -863,8 +910,22 @@ void diff_tensors_reverse_impl(
                     cur_der = it_dmap->second;
                 }
 
-                [[maybe_unused]] const auto [_, flag] = local_diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
-                assert(flag);
+                local_dvec().emplace_back(tmp_v_idx, std::move(cur_der));
+            } else {
+                // Check if we already computed this derivative.
+                if (const auto it = local_dmap().find(tmp_v_idx); it == local_dmap().end()) {
+                    // The derivative is new. If the diff argument is present in the
+                    // decomposition, then we will calculate the derivative and add it.
+                    // Otherwise, we set the derivative to zero and add it.
+                    expression cur_der = 0_dbl;
+
+                    if (const auto it_dmap = dmap.find(args[j]); it_dmap != dmap.end()) {
+                        cur_der = it_dmap->second;
+                    }
+
+                    [[maybe_unused]] const auto [_, flag] = local_dmap().try_emplace(tmp_v_idx, std::move(cur_der));
+                    assert(flag);
+                }
             }
         }
 
@@ -873,8 +934,12 @@ void diff_tensors_reverse_impl(
         assert(prev_begin != diff_map.end() || i + 1u == cur_nouts);
     }
 
-    // Merge local_diff_map into diff_map.
-    diff_map.insert(diff_map.end(), local_diff_map.begin(), local_diff_map.end());
+    // Merge the local map into diff_map.
+    if (cur_order == 1u) {
+        diff_map.insert(diff_map.end(), local_dvec().begin(), local_dvec().end());
+    } else {
+        diff_map.insert(diff_map.end(), local_dmap().begin(), local_dmap().end());
+    }
 }
 
 } // namespace
@@ -1099,9 +1164,11 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         // would be to do the computation in both modes (e.g., in parallel) and pick the mode which
         // results in the shortest decomposition. Perhaps we can consider this for a future extension.
         if (cur_nouts >= args.size()) {
-            diff_tensors_forward_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin);
+            diff_tensors_forward_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin,
+                                      cur_order + 1u);
         } else {
-            diff_tensors_reverse_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin);
+            diff_tensors_reverse_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin,
+                                      cur_order + 1u);
         }
 
         // Determine the range in diff_map for the current-order derivatives.

From 87a23a9a53387f5e4a3f89678672b67532cf48c8 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 15:48:18 +0100
Subject: [PATCH 08/12] Minor.

---
 src/expression_diff.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 3334595f7..5959efa4f 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -505,7 +505,7 @@ void diff_tensors_forward_impl(
     // will produce several times the same derivative, and thus
     // we need to store the derivatives in a dictionary in order
     // to prevent duplicates. For order-1 derivatives, no duplicate
-    // derivatives will be produced and thus we can use a plain vector
+    // derivatives will be produced and thus we can use a plain vector,
     // which can be quite a bit faster.
     using diff_map_t = fast_umap<dtens_v_idx_t, expression, diff_map_hasher>;
     using diff_vec_t = std::vector<std::pair<dtens_v_idx_t, expression>>;
@@ -740,7 +740,7 @@ void diff_tensors_reverse_impl(
     // will produce several times the same derivative, and thus
     // we need to store the derivatives in a dictionary in order
     // to prevent duplicates. For order-1 derivatives, no duplicate
-    // derivatives will be produced and thus we can use a plain vector
+    // derivatives will be produced and thus we can use a plain vector,
     // which can be quite a bit faster.
     using diff_map_t = fast_umap<dtens_v_idx_t, expression, diff_map_hasher>;
     using diff_vec_t = std::vector<std::pair<dtens_v_idx_t, expression>>;

From 99664eceb0f29cf60268ab24853b879a71f5e470 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 22:07:57 +0100
Subject: [PATCH 09/12] Fix quadratic complexity when building a ffnn model.

---
 src/model/ffnn.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/model/ffnn.cpp b/src/model/ffnn.cpp
index 4e5358021..435b8839b 100644
--- a/src/model/ffnn.cpp
+++ b/src/model/ffnn.cpp
@@ -19,6 +19,7 @@
 
 #include <heyoka/config.hpp>
 #include <heyoka/expression.hpp>
+#include <heyoka/math/sum.hpp>
 #include <heyoka/model/ffnn.hpp>
 
 HEYOKA_BEGIN_NAMESPACE
@@ -43,21 +44,28 @@ std::vector<expression> compute_layer(su32 layer_id, const std::vector<expressio
     auto n_neurons_prev_layer = su32(inputs.size());
     auto n_neurons_curr_layer = n_neurons[layer_id];
 
-    std::vector<expression> retval(static_cast<std::vector<expression>::size_type>(n_neurons_curr_layer), 0_dbl);
+    std::vector<expression> retval, tmp_sum;
+    retval.reserve(n_neurons_curr_layer);
+
     for (su32 i = 0; i < n_neurons_curr_layer; ++i) {
+        // Clear the summation terms.
+        tmp_sum.clear();
+
         for (su32 j = 0; j < n_neurons_prev_layer; ++j) {
 
             // Add the weight and update the weight counter.
-            retval[i] += nn_wb[wcounter] * inputs[j];
+            tmp_sum.push_back(nn_wb[wcounter] * inputs[j]);
             ++wcounter;
         }
 
         // Add the bias and update the counter.
-        retval[i] += nn_wb[bcounter + n_net_w];
+        tmp_sum.push_back(nn_wb[bcounter + n_net_w]);
         ++bcounter;
+
         // Activation function.
-        retval[i] = activation(retval[i]);
+        retval.push_back(activation(sum(tmp_sum)));
     }
+
     return retval;
 }
 

From 889e8e0baf00f0d57447c166572a16c64c13da54 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 22:16:45 +0100
Subject: [PATCH 10/12] Coverage fixes.

---
 src/expression_diff.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 5959efa4f..f2b52111d 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -749,7 +749,10 @@ void diff_tensors_reverse_impl(
 
     // Helpers to ease the access to the active member of the local_diff variant.
     // NOTE: if used incorrectly, these will throw at runtime.
-    auto local_dmap = [&local_diff]() -> diff_map_t & { return std::get<diff_map_t>(local_diff); };
+    // NOTE: currently local_dmap is never used because the heuristic
+    // for deciding between forward and reverse mode prevents reverse mode
+    // from being used for order > 1.
+    auto local_dmap = [&local_diff]() -> diff_map_t & { return std::get<diff_map_t>(local_diff); }; // LCOV_EXCL_LINE
     auto local_dvec = [&local_diff]() -> diff_vec_t & { return std::get<diff_vec_t>(local_diff); };
 
     // Cache the number of diff arguments.
@@ -912,6 +915,7 @@ void diff_tensors_reverse_impl(
 
                 local_dvec().emplace_back(tmp_v_idx, std::move(cur_der));
             } else {
+                // LCOV_EXCL_START
                 // Check if we already computed this derivative.
                 if (const auto it = local_dmap().find(tmp_v_idx); it == local_dmap().end()) {
                     // The derivative is new. If the diff argument is present in the
@@ -926,6 +930,7 @@ void diff_tensors_reverse_impl(
                     [[maybe_unused]] const auto [_, flag] = local_dmap().try_emplace(tmp_v_idx, std::move(cur_der));
                     assert(flag);
                 }
+                // LCOV_EXCL_STOP
             }
         }
 
@@ -938,7 +943,7 @@ void diff_tensors_reverse_impl(
     if (cur_order == 1u) {
         diff_map.insert(diff_map.end(), local_dvec().begin(), local_dvec().end());
     } else {
-        diff_map.insert(diff_map.end(), local_dmap().begin(), local_dmap().end());
+        diff_map.insert(diff_map.end(), local_dmap().begin(), local_dmap().end()); // LCOV_EXCL_LINE
     }
 }
 

From 360af6185faadbd6324e87495bb3379717d4c29b Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 22:18:37 +0100
Subject: [PATCH 11/12] Update changelog.

---
 doc/changelog.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 7c605e168..9a8e42762 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -21,6 +21,9 @@ New
 Changes
 ~~~~~~~
 
+- Substantial speedups in the computation of first-order derivatives
+  with respect to many variables/parameters
+  (`#358 <https://github.com/bluescarni/heyoka/pull/358>`__).
 - Substantial performance improvements in the computation of
   derivative tensors of large expressions with a high degree
   of internal redundancy

From 2832c9e69da937a27ad9981776da7e651dc32091 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 23:33:09 +0100
Subject: [PATCH 12/12] Dox fix.

---
 doc/advanced_tutorials.rst | 2 +-
 doc/basic_tutorials.rst    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/advanced_tutorials.rst b/doc/advanced_tutorials.rst
index ad1d9a419..02833671f 100644
--- a/doc/advanced_tutorials.rst
+++ b/doc/advanced_tutorials.rst
@@ -5,7 +5,7 @@ Advanced tutorials
 
 .. important::
 
-  More :ref:`tutorials <hypy:adv_tutorials>` and :ref:`examples <hypy:examples>` are available in the documentation
+  More tutorials and examples are available in the documentation
   of heyoka's `Python bindings <https://bluescarni.github.io/heyoka.py>`__.
 
 In this section we will show some of heyoka's more advanced functionalities,
diff --git a/doc/basic_tutorials.rst b/doc/basic_tutorials.rst
index 5c69e1a2f..951a8d6ef 100644
--- a/doc/basic_tutorials.rst
+++ b/doc/basic_tutorials.rst
@@ -5,7 +5,7 @@ Basic tutorials
 
 .. important::
 
-   More :ref:`tutorials <hypy:basic_tutorials>` and :ref:`examples <hypy:examples>` are available in the documentation
+   More tutorials and examples are available in the documentation
    of heyoka's `Python bindings <https://bluescarni.github.io/heyoka.py>`__.
 
 The code snippets in these tutorials assume the inclusion of the