diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml
index 278c2c74e..0534c9cb5 100644
--- a/.github/workflows/gha_ci.yml
+++ b/.github/workflows/gha_ci.yml
@@ -38,7 +38,7 @@ jobs:
           mkdir build
           cd build
           cmake ../ -G "Visual Studio 16 2019" -A x64 -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DBoost_NO_BOOST_CMAKE=ON -DHEYOKA_WITH_SLEEF=yes -DMPPP_GMP_INCLUDE_DIR=C:\Miniconda\envs\test\Library\include -DMPPP_GMP_LIBRARY=C:\Miniconda\envs\test\Library\lib\mpir.lib
-          cmake --build . --config Release
+          cmake --build . --config Release -j2
           copy Release\heyoka.dll test\Release\
           ctest -j4 -V -C Release
   windows_2019_llvm13:
@@ -60,7 +60,7 @@ jobs:
           mkdir build
           cd build
           cmake ../ -G "Visual Studio 16 2019" -A x64 -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DBoost_NO_BOOST_CMAKE=ON -DHEYOKA_WITH_SLEEF=yes -DMPPP_GMP_INCLUDE_DIR=C:\Miniconda\envs\test\Library\include -DMPPP_GMP_LIBRARY=C:\Miniconda\envs\test\Library\lib\mpir.lib
-          cmake --build . --config Release
+          cmake --build . --config Release -j2
           copy Release\heyoka.dll test\Release\
           ctest -j4 -V -C Release
   windows_2019_llvm14:
@@ -82,7 +82,7 @@ jobs:
           mkdir build
           cd build
           cmake ../ -G "Visual Studio 16 2019" -A x64 -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DBoost_NO_BOOST_CMAKE=ON -DHEYOKA_WITH_SLEEF=yes -DMPPP_GMP_INCLUDE_DIR=C:\Miniconda\envs\test\Library\include -DMPPP_GMP_LIBRARY=C:\Miniconda\envs\test\Library\lib\mpir.lib
-          cmake --build . --config Release
+          cmake --build . --config Release -j2
           copy Release\heyoka.dll test\Release\
           ctest -j4 -V -C Release
   conda_release_static:
diff --git a/README.md b/README.md
index 44a9b2d96..6db5534e2 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ heyoka is a C++ library for the integration of ordinary differential equations
 (ODEs) via Taylor's method, based on automatic differentiation techniques and aggressive just-in-time
 compilation via [LLVM](https://llvm.org/). Notable features include:
 
-* support for double-precision, extended-precision (80-bit and 128-bit),
+* support for single-precision, double-precision, extended-precision (80-bit and 128-bit),
   and arbitrary-precision floating-point types,
 * the ability to maintain machine precision accuracy over
   tens of billions of timesteps,
diff --git a/doc/changelog.rst b/doc/changelog.rst
index 22d8e63e8..afb92a7bb 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -7,6 +7,8 @@ Changelog
 New
 ~~~
 
+- Add support for single-precision computations
+  (`#363 <https://github.com/bluescarni/heyoka/pull/363>`__).
 - Add model implementing the ELP2000 analytical lunar theory
   (`#362 <https://github.com/bluescarni/heyoka/pull/362>`__).
 
diff --git a/doc/index.rst b/doc/index.rst
index 520be023f..ed11ce020 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -17,7 +17,7 @@ heyoka is a C++ library for the integration of ordinary differential equations
 on automatic differentiation techniques and aggressive just-in-time
 compilation via `LLVM <https://llvm.org/>`__. Notable features include:
 
-* support for double-precision, extended-precision (80-bit and 128-bit),
+* support for single-precision, double-precision, extended-precision (80-bit and 128-bit),
   and arbitrary-precision floating-point types,
 * the ability to maintain machine precision accuracy over
   tens of billions of timesteps,
@@ -110,7 +110,7 @@ license. The authors are Francesco Biscani and
 Dario Izzo (European Space Agency).
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
 
    install.rst
    basic_tutorials.rst
diff --git a/doc/install.rst b/doc/install.rst
index d69f1ac7c..038480c54 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -64,7 +64,8 @@ installing from source (the minimum required version is 3.18).
 Support for extended precision
 ``````````````````````````````
 
-Whereas in heyoka double-precision computations are always supported, support for extended-precision
+Whereas in heyoka single-precision and double-precision computations are always supported via the
+``float`` and ``double`` types, respectively, support for extended-precision
 computations varies depending on the software/hardware platform.
 
 80-bit precision
@@ -81,7 +82,7 @@ heyoka (and all its dependencies) have been compiled with a compiler supporting
 ^^^^^^^^^^^^^^^^^
 
 On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit ARM),
-quadruple-precision integrations are always supported. Otherwise,
+quadruple-precision integrations are always supported via ``long double``. Otherwise,
 on platforms such as x86-64, quadruple-precision computations are supported if:
 
 * the nonstandard ``__float128`` floating-point type is
diff --git a/doc/tut_extended_precision.rst b/doc/tut_extended_precision.rst
index 4cabc0bb3..00e0cb77c 100644
--- a/doc/tut_extended_precision.rst
+++ b/doc/tut_extended_precision.rst
@@ -4,7 +4,7 @@ Computations in extended precision
 ==================================
 
 As hinted in the :ref:`installation instructions <ep_support>`, heyoka supports computations
-not only in double precision, but also in extended precision. Specifically, heyoka currently supports:
+not only in single and double precision, but also in extended precision. Specifically, heyoka currently supports:
 
 * the 80-bit IEEE `extended-precision format <https://en.wikipedia.org/wiki/Extended_precision>`__ (~21 decimal digits),
 * the 128-bit IEEE `quadruple-precision format <https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format>`__ (~36 decimal digits).
diff --git a/include/heyoka/detail/event_detection.hpp b/include/heyoka/detail/event_detection.hpp
index b93845b89..0f7fe2705 100644
--- a/include/heyoka/detail/event_detection.hpp
+++ b/include/heyoka/detail/event_detection.hpp
@@ -41,6 +41,9 @@ inline T taylor_deduce_cooldown(T, T)
     static_assert(always_false_v<T>, "Unhandled type");
 }
 
+template <>
+float taylor_deduce_cooldown(float, float);
+
 template <>
 double taylor_deduce_cooldown(double, double);
 
diff --git a/include/heyoka/detail/type_traits.hpp b/include/heyoka/detail/type_traits.hpp
index 62bb792e8..34848ecd7 100644
--- a/include/heyoka/detail/type_traits.hpp
+++ b/include/heyoka/detail/type_traits.hpp
@@ -76,6 +76,10 @@ template <typename>
 struct is_supported_fp : std::false_type {
 };
 
+template <>
+struct is_supported_fp<float> : std::true_type {
+};
+
 template <>
 struct is_supported_fp<double> : std::true_type {
 };
diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index f7113e225..704dbffc8 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -81,6 +81,7 @@ class HEYOKA_DLL_PUBLIC expression
 public:
     expression();
 
+    explicit expression(float);
     explicit expression(double);
     explicit expression(long double);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -125,6 +126,9 @@ HEYOKA_DLL_PUBLIC bool is_fixed(const expression &);
 inline namespace literals
 {
 
+HEYOKA_DLL_PUBLIC expression operator""_flt(long double);
+HEYOKA_DLL_PUBLIC expression operator""_flt(unsigned long long);
+
 HEYOKA_DLL_PUBLIC expression operator""_dbl(long double);
 HEYOKA_DLL_PUBLIC expression operator""_dbl(unsigned long long);
 
@@ -257,6 +261,7 @@ HEYOKA_DLL_PUBLIC expression operator+(expression);
 HEYOKA_DLL_PUBLIC expression operator-(const expression &);
 
 HEYOKA_DLL_PUBLIC expression operator+(const expression &, const expression &);
+HEYOKA_DLL_PUBLIC expression operator+(const expression &, float);
 HEYOKA_DLL_PUBLIC expression operator+(const expression &, double);
 HEYOKA_DLL_PUBLIC expression operator+(const expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -265,6 +270,7 @@ HEYOKA_DLL_PUBLIC expression operator+(const expression &, mppp::real128);
 #if defined(HEYOKA_HAVE_REAL)
 HEYOKA_DLL_PUBLIC expression operator+(const expression &, mppp::real);
 #endif
+HEYOKA_DLL_PUBLIC expression operator+(float, const expression &);
 HEYOKA_DLL_PUBLIC expression operator+(double, const expression &);
 HEYOKA_DLL_PUBLIC expression operator+(long double, const expression &);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -276,6 +282,7 @@ HEYOKA_DLL_PUBLIC expression operator+(mppp::real, const expression &);
 
 HEYOKA_DLL_PUBLIC expression operator-(const expression &, const expression &);
 HEYOKA_DLL_PUBLIC expression operator-(const expression &, double);
+HEYOKA_DLL_PUBLIC expression operator-(const expression &, float);
 HEYOKA_DLL_PUBLIC expression operator-(const expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
 HEYOKA_DLL_PUBLIC expression operator-(const expression &, mppp::real128);
@@ -283,6 +290,7 @@ HEYOKA_DLL_PUBLIC expression operator-(const expression &, mppp::real128);
 #if defined(HEYOKA_HAVE_REAL)
 HEYOKA_DLL_PUBLIC expression operator-(const expression &, mppp::real);
 #endif
+HEYOKA_DLL_PUBLIC expression operator-(float, const expression &);
 HEYOKA_DLL_PUBLIC expression operator-(double, const expression &);
 HEYOKA_DLL_PUBLIC expression operator-(long double, const expression &);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -294,6 +302,7 @@ HEYOKA_DLL_PUBLIC expression operator-(mppp::real, const expression &);
 
 HEYOKA_DLL_PUBLIC expression operator*(const expression &, const expression &);
 HEYOKA_DLL_PUBLIC expression operator*(const expression &, double);
+HEYOKA_DLL_PUBLIC expression operator*(const expression &, float);
 HEYOKA_DLL_PUBLIC expression operator*(const expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
 HEYOKA_DLL_PUBLIC expression operator*(const expression &, mppp::real128);
@@ -301,6 +310,7 @@ HEYOKA_DLL_PUBLIC expression operator*(const expression &, mppp::real128);
 #if defined(HEYOKA_HAVE_REAL)
 HEYOKA_DLL_PUBLIC expression operator*(const expression &, mppp::real);
 #endif
+HEYOKA_DLL_PUBLIC expression operator*(float, const expression &);
 HEYOKA_DLL_PUBLIC expression operator*(double, const expression &);
 HEYOKA_DLL_PUBLIC expression operator*(long double, const expression &);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -311,6 +321,7 @@ HEYOKA_DLL_PUBLIC expression operator*(mppp::real, const expression &);
 #endif
 
 HEYOKA_DLL_PUBLIC expression operator/(const expression &, const expression &);
+HEYOKA_DLL_PUBLIC expression operator/(const expression &, float);
 HEYOKA_DLL_PUBLIC expression operator/(const expression &, double);
 HEYOKA_DLL_PUBLIC expression operator/(const expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -319,6 +330,7 @@ HEYOKA_DLL_PUBLIC expression operator/(const expression &, mppp::real128);
 #if defined(HEYOKA_HAVE_REAL)
 HEYOKA_DLL_PUBLIC expression operator/(const expression &, mppp::real);
 #endif
+HEYOKA_DLL_PUBLIC expression operator/(float, const expression &);
 HEYOKA_DLL_PUBLIC expression operator/(double, const expression &);
 HEYOKA_DLL_PUBLIC expression operator/(long double, const expression &);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -329,6 +341,7 @@ HEYOKA_DLL_PUBLIC expression operator/(mppp::real, const expression &);
 #endif
 
 HEYOKA_DLL_PUBLIC expression &operator+=(expression &, const expression &);
+HEYOKA_DLL_PUBLIC expression &operator+=(expression &, float);
 HEYOKA_DLL_PUBLIC expression &operator+=(expression &, double);
 HEYOKA_DLL_PUBLIC expression &operator+=(expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -339,6 +352,7 @@ HEYOKA_DLL_PUBLIC expression &operator+=(expression &, mppp::real);
 #endif
 
 HEYOKA_DLL_PUBLIC expression &operator-=(expression &, const expression &);
+HEYOKA_DLL_PUBLIC expression &operator-=(expression &, float);
 HEYOKA_DLL_PUBLIC expression &operator-=(expression &, double);
 HEYOKA_DLL_PUBLIC expression &operator-=(expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -349,6 +363,7 @@ HEYOKA_DLL_PUBLIC expression &operator-=(expression &, mppp::real);
 #endif
 
 HEYOKA_DLL_PUBLIC expression &operator*=(expression &, const expression &);
+HEYOKA_DLL_PUBLIC expression &operator*=(expression &, float);
 HEYOKA_DLL_PUBLIC expression &operator*=(expression &, double);
 HEYOKA_DLL_PUBLIC expression &operator*=(expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
@@ -359,6 +374,7 @@ HEYOKA_DLL_PUBLIC expression &operator*=(expression &, mppp::real);
 #endif
 
 HEYOKA_DLL_PUBLIC expression &operator/=(expression &, const expression &);
+HEYOKA_DLL_PUBLIC expression &operator/=(expression &, float);
 HEYOKA_DLL_PUBLIC expression &operator/=(expression &, double);
 HEYOKA_DLL_PUBLIC expression &operator/=(expression &, long double);
 #if defined(HEYOKA_HAVE_REAL128)
diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index ad6e08901..ab399fabb 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -61,6 +61,7 @@ struct target_features {
     bool vsx = false;
     bool vsx3 = false;
     // Recommended SIMD sizes.
+    std::uint32_t simd_size_flt = 1;
     std::uint32_t simd_size_dbl = 1;
     std::uint32_t simd_size_ldbl = 1;
 #if defined(HEYOKA_HAVE_REAL128)
@@ -99,6 +100,9 @@ inline std::uint32_t recommended_simd_size()
     return 0;
 }
 
+template <>
+HEYOKA_DLL_PUBLIC std::uint32_t recommended_simd_size<float>();
+
 template <>
 HEYOKA_DLL_PUBLIC std::uint32_t recommended_simd_size<double>();
 
diff --git a/include/heyoka/math/atan2.hpp b/include/heyoka/math/atan2.hpp
index 48d427c2e..c741b2f6b 100644
--- a/include/heyoka/math/atan2.hpp
+++ b/include/heyoka/math/atan2.hpp
@@ -75,6 +75,7 @@ class HEYOKA_DLL_PUBLIC atan2_impl : public func_base
 
 HEYOKA_DLL_PUBLIC expression atan2(expression, expression);
 
+HEYOKA_DLL_PUBLIC expression atan2(expression, float);
 HEYOKA_DLL_PUBLIC expression atan2(expression, double);
 HEYOKA_DLL_PUBLIC expression atan2(expression, long double);
 
@@ -90,6 +91,7 @@ HEYOKA_DLL_PUBLIC expression atan2(expression, mppp::real);
 
 #endif
 
+HEYOKA_DLL_PUBLIC expression atan2(float, expression);
 HEYOKA_DLL_PUBLIC expression atan2(double, expression);
 HEYOKA_DLL_PUBLIC expression atan2(long double, expression);
 
diff --git a/include/heyoka/math/kepDE.hpp b/include/heyoka/math/kepDE.hpp
index 4352cd52a..6a4686e3c 100644
--- a/include/heyoka/math/kepDE.hpp
+++ b/include/heyoka/math/kepDE.hpp
@@ -72,6 +72,7 @@ HEYOKA_DLL_PUBLIC expression kepDE(expression, expression, expression);
     HEYOKA_DLL_PUBLIC expression kepDE(expression, type, expression);                                                  \
     HEYOKA_DLL_PUBLIC expression kepDE(type, expression, expression)
 
+HEYOKA_DECLARE_KEPDE_OVERLOADS(float);
 HEYOKA_DECLARE_KEPDE_OVERLOADS(double);
 HEYOKA_DECLARE_KEPDE_OVERLOADS(long double);
 
diff --git a/include/heyoka/math/kepE.hpp b/include/heyoka/math/kepE.hpp
index c1e2df224..88aa983fe 100644
--- a/include/heyoka/math/kepE.hpp
+++ b/include/heyoka/math/kepE.hpp
@@ -77,6 +77,7 @@ HEYOKA_DLL_PUBLIC expression kepE(expression, expression);
     HEYOKA_DLL_PUBLIC expression kepE(expression, type);                                                               \
     HEYOKA_DLL_PUBLIC expression kepE(type, expression);
 
+HEYOKA_DECLARE_KEPE_OVERLOADS(float);
 HEYOKA_DECLARE_KEPE_OVERLOADS(double);
 HEYOKA_DECLARE_KEPE_OVERLOADS(long double);
 
diff --git a/include/heyoka/math/kepF.hpp b/include/heyoka/math/kepF.hpp
index b94787aeb..fb2e34387 100644
--- a/include/heyoka/math/kepF.hpp
+++ b/include/heyoka/math/kepF.hpp
@@ -81,6 +81,7 @@ HEYOKA_DLL_PUBLIC expression kepF(expression, expression, expression);
     HEYOKA_DLL_PUBLIC expression kepF(expression, type, expression);                                                   \
     HEYOKA_DLL_PUBLIC expression kepF(type, expression, expression)
 
+HEYOKA_DECLARE_KEPF_OVERLOADS(float);
 HEYOKA_DECLARE_KEPF_OVERLOADS(double);
 HEYOKA_DECLARE_KEPF_OVERLOADS(long double);
 
diff --git a/include/heyoka/math/pow.hpp b/include/heyoka/math/pow.hpp
index de52a3003..1e0b6d5ac 100644
--- a/include/heyoka/math/pow.hpp
+++ b/include/heyoka/math/pow.hpp
@@ -114,6 +114,7 @@ pow_eval_algo get_pow_eval_algo(const pow_impl &);
 } // namespace detail
 
 HEYOKA_DLL_PUBLIC expression pow(expression, expression);
+HEYOKA_DLL_PUBLIC expression pow(expression, float);
 HEYOKA_DLL_PUBLIC expression pow(expression, double);
 HEYOKA_DLL_PUBLIC expression pow(expression, long double);
 
diff --git a/include/heyoka/step_callback.hpp b/include/heyoka/step_callback.hpp
index f3465d555..46d58e8c7 100644
--- a/include/heyoka/step_callback.hpp
+++ b/include/heyoka/step_callback.hpp
@@ -121,7 +121,7 @@ struct HEYOKA_DLL_PUBLIC_INLINE_CLASS step_callback_inner final : step_callback_
     void serialize(Archive &ar, unsigned)
     {
         ar &boost::serialization::base_object<step_callback_inner_base<TA>>(*this);
-        ar &m_value;
+        ar & m_value;
     }
 };
 
@@ -135,7 +135,7 @@ class HEYOKA_DLL_PUBLIC step_callback_impl
     template <typename Archive>
     void serialize(Archive &ar, unsigned)
     {
-        ar &m_ptr;
+        ar & m_ptr;
     }
 
     // Meta-programming for the generic ctor.
@@ -226,6 +226,9 @@ using step_callback_batch = detail::step_callback_impl<taylor_adaptive_batch<T>>
 HEYOKA_END_NAMESPACE
 
 // Disable Boost.Serialization tracking for the implementation details of step_callback.
+BOOST_CLASS_TRACKING(heyoka::detail::step_callback_inner_base<heyoka::taylor_adaptive<float>>,
+                     boost::serialization::track_never)
+
 BOOST_CLASS_TRACKING(heyoka::detail::step_callback_inner_base<heyoka::taylor_adaptive<double>>,
                      boost::serialization::track_never)
 
@@ -246,6 +249,9 @@ BOOST_CLASS_TRACKING(heyoka::detail::step_callback_inner_base<heyoka::taylor_ada
 
 #endif
 
+BOOST_CLASS_TRACKING(heyoka::detail::step_callback_inner_base<heyoka::taylor_adaptive_batch<float>>,
+                     boost::serialization::track_never)
+
 BOOST_CLASS_TRACKING(heyoka::detail::step_callback_inner_base<heyoka::taylor_adaptive_batch<double>>,
                      boost::serialization::track_never)
 
diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp
index 374caa44b..0ccdb56d2 100644
--- a/include/heyoka/taylor.hpp
+++ b/include/heyoka/taylor.hpp
@@ -414,6 +414,11 @@ inline std::ostream &operator<<(std::ostream &os, const nt_event_impl<T, B> &)
     return os;
 }
 
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const nt_event_impl<float, false> &);
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const nt_event_impl<float, true> &);
+
 template <>
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const nt_event_impl<double, false> &);
 template <>
@@ -535,6 +540,11 @@ inline std::ostream &operator<<(std::ostream &os, const t_event_impl<T, B> &)
     return os;
 }
 
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const t_event_impl<float, false> &);
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const t_event_impl<float, true> &);
+
 template <>
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const t_event_impl<double, false> &);
 template <>
@@ -641,6 +651,9 @@ inline std::ostream &operator<<(std::ostream &os, const continuous_output<T> &)
     return os;
 }
 
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const continuous_output<float> &);
+
 template <>
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const continuous_output<double> &);
 
@@ -735,6 +748,9 @@ inline std::ostream &operator<<(std::ostream &os, const continuous_output_batch<
     return os;
 }
 
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const continuous_output_batch<float> &);
+
 template <>
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const continuous_output_batch<double> &);
 
@@ -1731,6 +1747,9 @@ inline std::ostream &operator<<(std::ostream &os, const taylor_adaptive<T> &)
     return os;
 }
 
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const taylor_adaptive<float> &);
+
 template <>
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const taylor_adaptive<double> &);
 
@@ -1759,6 +1778,9 @@ inline std::ostream &operator<<(std::ostream &os, const taylor_adaptive_batch<T>
     return os;
 }
 
+template <>
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const taylor_adaptive_batch<float> &);
+
 template <>
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const taylor_adaptive_batch<double> &);
 
@@ -1789,9 +1811,11 @@ inline constexpr int taylor_adaptive_batch_s11n_version = 1;
 HEYOKA_END_NAMESPACE
 
 // Set the Boost s11n class version for taylor_adaptive and taylor_adaptive_batch.
+BOOST_CLASS_VERSION(heyoka::taylor_adaptive<float>, heyoka::detail::taylor_adaptive_s11n_version);
 BOOST_CLASS_VERSION(heyoka::taylor_adaptive<double>, heyoka::detail::taylor_adaptive_s11n_version);
 BOOST_CLASS_VERSION(heyoka::taylor_adaptive<long double>, heyoka::detail::taylor_adaptive_s11n_version);
 
+BOOST_CLASS_VERSION(heyoka::taylor_adaptive_batch<float>, heyoka::detail::taylor_adaptive_batch_s11n_version);
 BOOST_CLASS_VERSION(heyoka::taylor_adaptive_batch<double>, heyoka::detail::taylor_adaptive_batch_s11n_version);
 BOOST_CLASS_VERSION(heyoka::taylor_adaptive_batch<long double>, heyoka::detail::taylor_adaptive_batch_s11n_version);
 
diff --git a/src/detail/event_detection.cpp b/src/detail/event_detection.cpp
index 9788553a3..39728d754 100644
--- a/src/detail/event_detection.cpp
+++ b/src/detail/event_detection.cpp
@@ -19,7 +19,6 @@
 #include <stdexcept>
 #include <tuple>
 #include <type_traits>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -195,12 +194,12 @@ auto poly_eval_1(InputIt a, T x, std::uint32_t n)
     assert(n >= 2u); // LCOV_EXCL_LINE
 
     // Init the return value.
-    auto ret1 = a[n] * n;
+    auto ret1 = a[n] * static_cast<T>(n);
 
     for (std::uint32_t i = 1; i < n; ++i) {
         // NOTE: possible optimisation for mppp::real here:
         // use fmma() directly, once exposed in mp++.
-        ret1 = a[n - i] * (n - i) + std::move(ret1) * x;
+        ret1 = a[n - i] * static_cast<T>(n - i) + std::move(ret1) * x;
     }
 
     return ret1;
@@ -495,6 +494,12 @@ T taylor_deduce_cooldown_impl(T g_eps, T abs_der)
 
 } // namespace
 
+template <>
+float taylor_deduce_cooldown(float g_eps, float abs_der)
+{
+    return taylor_deduce_cooldown_impl(g_eps, abs_der);
+}
+
 template <>
 double taylor_deduce_cooldown(double g_eps, double abs_der)
 {
@@ -1051,8 +1056,7 @@ void taylor_adaptive<T>::ed_data::detect_events(const T &h, std::uint32_t order,
             // detection altogether without a warning. This is ok,
             // and non-finite Taylor coefficients will be caught in the
             // step() implementations anyway.
-            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-            std::uint32_t fex_check_result;
+            std::uint32_t fex_check_result{};
             m_fex_check(ptr, &h, &back_int, &fex_check_result);
             if (fex_check_result) {
                 continue;
@@ -1295,8 +1299,7 @@ void taylor_adaptive<T>::ed_data::detect_events(const T &h, std::uint32_t order,
 
                 // Reverse tmp into tmp1, translate tmp1 by 1 with output
                 // in tmp2, and count the sign changes in tmp2.
-                // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-                std::uint32_t n_sc;
+                std::uint32_t n_sc{};
                 m_rtscc(tmp1.v.data(), tmp2.v.data(), &n_sc, tmp.v.data());
 
                 if (n_sc == 1u) {
@@ -1436,6 +1439,7 @@ void taylor_adaptive<T>::ed_data::detect_events(const T &h, std::uint32_t order,
 
 // Instantiate the book-keeping structure for event detection
 // in the scalar integrator.
+template struct taylor_adaptive<float>::ed_data;
 template struct taylor_adaptive<double>::ed_data;
 template struct taylor_adaptive<long double>::ed_data;
 
@@ -1985,8 +1989,7 @@ void taylor_adaptive_batch<T>::ed_data::detect_events(const T *h_ptr, std::uint3
 
                     // Reverse tmp into tmp1, translate tmp1 by 1 with output
                     // in tmp2, and count the sign changes in tmp2.
-                    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-                    std::uint32_t n_sc;
+                    std::uint32_t n_sc{};
                     m_rtscc(tmp1.v.data(), tmp2.v.data(), &n_sc, tmp.v.data());
 
                     if (n_sc == 1u) {
@@ -2137,6 +2140,7 @@ void taylor_adaptive_batch<T>::ed_data::detect_events(const T *h_ptr, std::uint3
 
 // Instantiate the book-keeping structure for event detection
 // in the batch integrator.
+template struct taylor_adaptive_batch<float>::ed_data;
 template struct taylor_adaptive_batch<double>::ed_data;
 template struct taylor_adaptive_batch<long double>::ed_data;
 
diff --git a/src/detail/num_utils.cpp b/src/detail/num_utils.cpp
index bae928b11..0c876614a 100644
--- a/src/detail/num_utils.cpp
+++ b/src/detail/num_utils.cpp
@@ -46,6 +46,8 @@ T num_zero_like([[maybe_unused]] const T &x)
 #endif
 }
 
+template float num_zero_like(const float &);
+
 template double num_zero_like(const double &);
 
 template long double num_zero_like(const long double &);
@@ -76,6 +78,8 @@ T num_one_like([[maybe_unused]] const T &x)
 #endif
 }
 
+template float num_one_like(const float &);
+
 template double num_one_like(const double &);
 
 template long double num_one_like(const long double &);
@@ -106,6 +110,8 @@ T num_eps_like([[maybe_unused]] const T &x)
 #endif
 }
 
+template float num_eps_like(const float &);
+
 template double num_eps_like(const double &);
 
 template long double num_eps_like(const long double &);
@@ -136,6 +142,8 @@ T num_inf_like([[maybe_unused]] const T &x)
 #endif
 }
 
+template float num_inf_like(const float &);
+
 template double num_inf_like(const double &);
 
 template long double num_inf_like(const long double &);
diff --git a/src/detail/string_conv.cpp b/src/detail/string_conv.cpp
index 608da3936..0f5bdbe26 100644
--- a/src/detail/string_conv.cpp
+++ b/src/detail/string_conv.cpp
@@ -52,7 +52,7 @@ std::uint32_t uname_to_index(const std::string &s)
 template <typename T>
 std::string fp_to_string(const T &x)
 {
-    if constexpr (std::is_same_v<T, double> || std::is_same_v<T, long double>) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double> || std::is_same_v<T, long double>) {
         return fmt::format("{}", x);
 #if defined(HEYOKA_HAVE_REAL128)
     } else if constexpr (std::is_same_v<T, mppp::real128>) {
@@ -68,6 +68,7 @@ std::string fp_to_string(const T &x)
 }
 
 // Explicit instantiations.
+template HEYOKA_DLL_PUBLIC std::string fp_to_string<float>(const float &);
 template HEYOKA_DLL_PUBLIC std::string fp_to_string<double>(const double &);
 template HEYOKA_DLL_PUBLIC std::string fp_to_string<long double>(const long double &);
 
diff --git a/src/detail/vector_math.cpp b/src/detail/vector_math.cpp
index bbd1141fe..1f47450a1 100644
--- a/src/detail/vector_math.cpp
+++ b/src/detail/vector_math.cpp
@@ -10,6 +10,7 @@
 #include <cassert>
 #include <cstdint>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
@@ -49,9 +50,10 @@ auto make_vfinfo(const char *s_name, std::string v_name, std::uint32_t width, st
 #if defined(HEYOKA_WITH_SLEEF)
 
 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-auto add_vfinfo_sleef(vf_map_t &retval, const char *scalar_name, const char *sleef_base_name, const char *sleef_tp,
+auto add_vfinfo_sleef(vf_map_t &retval, const char *scalar_name, const char *sleef_base_name, std::string_view sleef_tp,
                       std::uint32_t nargs = 1)
 {
+    assert(sleef_tp == "d" || sleef_tp == "f");
     assert(retval.find(scalar_name) == retval.end());
     assert(nargs > 0u);
 
@@ -62,22 +64,31 @@ auto add_vfinfo_sleef(vf_map_t &retval, const char *scalar_name, const char *sle
 
     const auto &features = get_target_features();
 
+    // NOTE: we need to select the SIMD width(s) based on the floating-point type (sleef_tp).
+    // All supported SIMD extensions start with a minimum width of 2 for double-precision
+    // and 4 for single-precision, possibly supporting larger widths. So we use these two
+    // values for the computation.
+    const std::uint32_t base_simd_width = sleef_tp == "d" ? 2 : 4;
+
     if (features.avx512f) {
         retval[scalar_name]
-            = {make_sleef_vfinfo(2, "avx2128"), make_sleef_vfinfo(4, "avx2"), make_sleef_vfinfo(8, "avx512f")};
+            = {make_sleef_vfinfo(base_simd_width, "avx2128"), make_sleef_vfinfo(base_simd_width * 2u, "avx2"),
+               make_sleef_vfinfo(base_simd_width * 4u, "avx512f")};
     } else if (features.avx2) {
-        retval[scalar_name] = {make_sleef_vfinfo(2, "avx2128"), make_sleef_vfinfo(4, "avx2")};
+        retval[scalar_name]
+            = {make_sleef_vfinfo(base_simd_width, "avx2128"), make_sleef_vfinfo(base_simd_width * 2u, "avx2")};
     } else if (features.avx) {
-        retval[scalar_name] = {make_sleef_vfinfo(2, "sse4"), make_sleef_vfinfo(4, "avx")};
+        retval[scalar_name]
+            = {make_sleef_vfinfo(base_simd_width, "sse4"), make_sleef_vfinfo(base_simd_width * 2u, "avx")};
     } else if (features.sse2) {
-        retval[scalar_name] = {make_sleef_vfinfo(2, "sse2")};
+        retval[scalar_name] = {make_sleef_vfinfo(base_simd_width, "sse2")};
     } else if (features.aarch64) {
-        retval[scalar_name] = {make_sleef_vfinfo(2, "advsimd")};
+        retval[scalar_name] = {make_sleef_vfinfo(base_simd_width, "advsimd")};
     } else if (features.vsx) {
         // NOTE: at this time the sleef conda package for PPC64 does not seem
         // to provide VSX3 functions. Thus, for now we use only the
         // VSX implementations.
-        retval[scalar_name] = {make_sleef_vfinfo(2, "vsx")};
+        retval[scalar_name] = {make_sleef_vfinfo(base_simd_width, "vsx")};
     }
 }
 
@@ -96,6 +107,25 @@ auto make_vf_map()
     // but I am not 100% sure for the other archs. Let's keep this in mind.
     // NOTE: the same holds for things like abs() and floor().
 
+    // Single-precision.
+    add_vfinfo_sleef(retval, "llvm.sin.f32", "sin", "f");
+    add_vfinfo_sleef(retval, "llvm.cos.f32", "cos", "f");
+    add_vfinfo_sleef(retval, "llvm.log.f32", "log", "f");
+    add_vfinfo_sleef(retval, "llvm.exp.f32", "exp", "f");
+    add_vfinfo_sleef(retval, "llvm.pow.f32", "pow", "f", 2);
+    add_vfinfo_sleef(retval, "sinhf", "sinh", "f");
+    add_vfinfo_sleef(retval, "coshf", "cosh", "f");
+    add_vfinfo_sleef(retval, "asinf", "asin", "f");
+    add_vfinfo_sleef(retval, "acosf", "acos", "f");
+    add_vfinfo_sleef(retval, "asinhf", "asinh", "f");
+    add_vfinfo_sleef(retval, "acoshf", "acosh", "f");
+    add_vfinfo_sleef(retval, "tanf", "tan", "f");
+    add_vfinfo_sleef(retval, "tanhf", "tanh", "f");
+    add_vfinfo_sleef(retval, "atanf", "atan", "f");
+    add_vfinfo_sleef(retval, "atanhf", "atanh", "f");
+    add_vfinfo_sleef(retval, "atan2f", "atan2", "f", 2);
+    add_vfinfo_sleef(retval, "erff", "erf", "f");
+
     // Double-precision.
     add_vfinfo_sleef(retval, "llvm.sin.f64", "sin", "d");
     add_vfinfo_sleef(retval, "llvm.cos.f64", "cos", "d");
diff --git a/src/ensemble_propagate.cpp b/src/ensemble_propagate.cpp
index 6497ddfe9..8dffc1d02 100644
--- a/src/ensemble_propagate.cpp
+++ b/src/ensemble_propagate.cpp
@@ -216,6 +216,7 @@ ensemble_propagate_grid_impl(const taylor_adaptive<T> &ta, std::vector<T> grid,
                                         std::size_t, T, step_callback<T> &);
 // NOLINTEND
 
+HEYOKA_ENSEMBLE_PROPAGATE_SCALAR_INST(float)
 HEYOKA_ENSEMBLE_PROPAGATE_SCALAR_INST(double)
 HEYOKA_ENSEMBLE_PROPAGATE_SCALAR_INST(long double)
 
@@ -406,6 +407,7 @@ std::vector<std::tuple<taylor_adaptive_batch<T>, std::vector<T>>> ensemble_propa
         const std::vector<T> &, step_callback_batch<T> &);
 // NOLINTEND
 
+HEYOKA_ENSEMBLE_PROPAGATE_BATCH_INST(float)
 HEYOKA_ENSEMBLE_PROPAGATE_BATCH_INST(double)
 HEYOKA_ENSEMBLE_PROPAGATE_BATCH_INST(long double)
 
diff --git a/src/expression_basic.cpp b/src/expression_basic.cpp
index 8192b5e30..8a6078010 100644
--- a/src/expression_basic.cpp
+++ b/src/expression_basic.cpp
@@ -66,6 +66,8 @@ HEYOKA_BEGIN_NAMESPACE
 
 expression::expression() : expression(number{0.}) {}
 
+expression::expression(float x) : expression(number{x}) {}
+
 expression::expression(double x) : expression(number{x}) {}
 
 expression::expression(long double x) : expression(number{x}) {}
@@ -227,6 +229,16 @@ std::vector<expression> copy(const std::vector<expression> &v_ex)
 inline namespace literals
 {
 
+expression operator""_flt(long double x)
+{
+    return expression{static_cast<float>(x)};
+}
+
+expression operator""_flt(unsigned long long n)
+{
+    return expression{static_cast<float>(n)};
+}
+
 expression operator""_dbl(long double x)
 {
     return expression{static_cast<double>(x)};
diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp
index 1214a48c5..c0db04ed0 100644
--- a/src/expression_cfunc.cpp
+++ b/src/expression_cfunc.cpp
@@ -1749,6 +1749,14 @@ std::vector<expression> add_cfunc(llvm_state &s, const std::string &name, const
 }
 
 // Explicit instantiations.
+template HEYOKA_DLL_PUBLIC std::vector<expression> add_cfunc<float>(llvm_state &, const std::string &,
+                                                                    const std::vector<expression> &, std::uint32_t,
+                                                                    bool, bool, bool, long long);
+template HEYOKA_DLL_PUBLIC std::vector<expression> add_cfunc<float>(llvm_state &, const std::string &,
+                                                                    const std::vector<expression> &,
+                                                                    const std::vector<expression> &, std::uint32_t,
+                                                                    bool, bool, bool, long long);
+
 template HEYOKA_DLL_PUBLIC std::vector<expression> add_cfunc<double>(llvm_state &, const std::string &,
                                                                      const std::vector<expression> &, std::uint32_t,
                                                                      bool, bool, bool, long long);
diff --git a/src/expression_ops.cpp b/src/expression_ops.cpp
index c3c37fe73..714fef3d2 100644
--- a/src/expression_ops.cpp
+++ b/src/expression_ops.cpp
@@ -37,265 +37,136 @@ expression operator+(expression e)
     return e;
 }
 
+// NOTE: in these operators we check for number arguments
+// immediately, before forwarding to the underlying implementation.
+// We do this in order to avoid accidental promotions and incorrect
+// precision propagation due to the use of double-precision constants
+// in the implementations of the primitives.
 expression operator-(const expression &e)
 {
-    return prod({expression{number{-1.}}, e});
+    if (const auto *nptr = std::get_if<number>(&e.value())) {
+        return expression{-*nptr};
+    } else {
+        return prod({expression{number{-1.}}, e});
+    }
 }
 
 // NOLINTNEXTLINE(misc-no-recursion)
 expression operator+(const expression &e1, const expression &e2)
 {
-    return sum({e1, e2});
+    if (std::holds_alternative<number>(e1.value()) && std::holds_alternative<number>(e2.value())) {
+        return expression{std::get<number>(e1.value()) + std::get<number>(e2.value())};
+    } else {
+        return sum({e1, e2});
+    }
 }
 
 // NOLINTNEXTLINE(misc-no-recursion)
 expression operator-(const expression &e1, const expression &e2)
 {
-    return e1 + -e2;
+    if (std::holds_alternative<number>(e1.value()) && std::holds_alternative<number>(e2.value())) {
+        return expression{std::get<number>(e1.value()) - std::get<number>(e2.value())};
+    } else {
+        return e1 + -e2;
+    }
 }
 
 // NOLINTNEXTLINE(misc-no-recursion)
 expression operator*(const expression &e1, const expression &e2)
 {
-    return prod({e1, e2});
+    if (std::holds_alternative<number>(e1.value()) && std::holds_alternative<number>(e2.value())) {
+        return expression{std::get<number>(e1.value()) * std::get<number>(e2.value())};
+    } else {
+        return prod({e1, e2});
+    }
 }
 
 // NOLINTNEXTLINE(misc-no-recursion)
 expression operator/(const expression &e1, const expression &e2)
 {
     if (std::holds_alternative<number>(e1.value()) && std::holds_alternative<number>(e2.value())) {
-        // NOTE: if e1 and e2 are numbers, do immediately constant folding. Otherwise, constant folding
-        // is first done on pow(e2, -1_dbl) and then on the product, which leads to wrong precision
-        // propagation in case e1 and e2 have different precisions.
         return expression{std::get<number>(e1.value()) / std::get<number>(e2.value())};
     } else {
         return prod({e1, pow(e2, -1_dbl)});
     }
 }
 
-expression operator+(const expression &ex, double x)
-{
-    return ex + expression{x};
-}
-
-expression operator+(const expression &ex, long double x)
-{
-    return ex + expression{x};
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression operator+(const expression &ex, mppp::real128 x)
-{
-    return ex + expression{x};
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression operator+(const expression &ex, mppp::real x)
-{
-    return ex + expression{std::move(x)};
-}
-
-#endif
-
-expression operator+(double x, const expression &ex)
-{
-    return expression{x} + ex;
-}
-
-expression operator+(long double x, const expression &ex)
-{
-    return expression{x} + ex;
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression operator+(mppp::real128 x, const expression &ex)
-{
-    return expression{x} + ex;
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression operator+(mppp::real x, const expression &ex)
-{
-    return expression{std::move(x)} + ex;
-}
-
-#endif
-
-expression operator-(const expression &ex, double x)
-{
-    return ex - expression{x};
-}
-
-expression operator-(const expression &ex, long double x)
-{
-    return ex - expression{x};
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression operator-(const expression &ex, mppp::real128 x)
-{
-    return ex - expression{x};
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression operator-(const expression &ex, mppp::real x)
-{
-    return ex - expression{std::move(x)};
-}
-
-#endif
-
-expression operator-(double x, const expression &ex)
-{
-    return expression{x} - ex;
-}
-
-expression operator-(long double x, const expression &ex)
-{
-    return expression{x} - ex;
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression operator-(mppp::real128 x, const expression &ex)
-{
-    return expression{x} - ex;
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression operator-(mppp::real x, const expression &ex)
-{
-    return expression{std::move(x)} - ex;
-}
-
-#endif
-
-expression operator*(const expression &ex, double x)
-{
-    return ex * expression{x};
-}
-
-expression operator*(const expression &ex, long double x)
-{
-    return ex * expression{x};
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression operator*(const expression &ex, mppp::real128 x)
-{
-    return ex * expression{x};
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression operator*(const expression &ex, mppp::real x)
-{
-    return ex * expression{std::move(x)};
-}
-
-#endif
+#define HEYOKA_EX_BINARY_OP_R(op, type)                                                                                \
+    expression operator op(const expression &ex, type x)                                                               \
+    {                                                                                                                  \
+        return ex op expression{std::move(x)};                                                                         \
+    }
 
-expression operator*(double x, const expression &ex)
-{
-    return expression{x} * ex;
-}
+#define HEYOKA_EX_BINARY_OP_L(op, type)                                                                                \
+    expression operator op(type x, const expression &ex)                                                               \
+    {                                                                                                                  \
+        return expression{std::move(x)} op ex;                                                                         \
+    }
 
-expression operator*(long double x, const expression &ex)
-{
-    return expression{x} * ex;
-}
+HEYOKA_EX_BINARY_OP_R(+, float)
+HEYOKA_EX_BINARY_OP_R(+, double)
+HEYOKA_EX_BINARY_OP_R(+, long double)
+HEYOKA_EX_BINARY_OP_R(-, float)
+HEYOKA_EX_BINARY_OP_R(-, double)
+HEYOKA_EX_BINARY_OP_R(-, long double)
+HEYOKA_EX_BINARY_OP_R(*, float)
+HEYOKA_EX_BINARY_OP_R(*, double)
+HEYOKA_EX_BINARY_OP_R(*, long double)
+HEYOKA_EX_BINARY_OP_R(/, float)
+HEYOKA_EX_BINARY_OP_R(/, double)
+HEYOKA_EX_BINARY_OP_R(/, long double)
 
 #if defined(HEYOKA_HAVE_REAL128)
 
-expression operator*(mppp::real128 x, const expression &ex)
-{
-    return expression{x} * ex;
-}
+HEYOKA_EX_BINARY_OP_R(+, mppp::real128)
+HEYOKA_EX_BINARY_OP_R(-, mppp::real128)
+HEYOKA_EX_BINARY_OP_R(*, mppp::real128)
+HEYOKA_EX_BINARY_OP_R(/, mppp::real128)
 
 #endif
 
 #if defined(HEYOKA_HAVE_REAL)
 
-expression operator*(mppp::real x, const expression &ex)
-{
-    return expression{std::move(x)} * ex;
-}
+HEYOKA_EX_BINARY_OP_R(+, mppp::real)
+HEYOKA_EX_BINARY_OP_R(-, mppp::real)
+HEYOKA_EX_BINARY_OP_R(*, mppp::real)
+HEYOKA_EX_BINARY_OP_R(/, mppp::real)
 
 #endif
 
-expression operator/(const expression &ex, double x)
-{
-    return ex / expression{x};
-}
-
-expression operator/(const expression &ex, long double x)
-{
-    return ex / expression{x};
-}
+HEYOKA_EX_BINARY_OP_L(+, float)
+HEYOKA_EX_BINARY_OP_L(+, double)
+HEYOKA_EX_BINARY_OP_L(+, long double)
+HEYOKA_EX_BINARY_OP_L(-, float)
+HEYOKA_EX_BINARY_OP_L(-, double)
+HEYOKA_EX_BINARY_OP_L(-, long double)
+HEYOKA_EX_BINARY_OP_L(*, float)
+HEYOKA_EX_BINARY_OP_L(*, double)
+HEYOKA_EX_BINARY_OP_L(*, long double)
+HEYOKA_EX_BINARY_OP_L(/, float)
+HEYOKA_EX_BINARY_OP_L(/, double)
+HEYOKA_EX_BINARY_OP_L(/, long double)
 
 #if defined(HEYOKA_HAVE_REAL128)
 
-expression operator/(const expression &ex, mppp::real128 x)
-{
-    return ex / expression{x};
-}
+HEYOKA_EX_BINARY_OP_L(+, mppp::real128)
+HEYOKA_EX_BINARY_OP_L(-, mppp::real128)
+HEYOKA_EX_BINARY_OP_L(*, mppp::real128)
+HEYOKA_EX_BINARY_OP_L(/, mppp::real128)
 
 #endif
 
 #if defined(HEYOKA_HAVE_REAL)
 
-expression operator/(const expression &ex, mppp::real x)
-{
-    return ex / expression{std::move(x)};
-}
-
-#endif
-
-expression operator/(double x, const expression &ex)
-{
-    return expression{x} / ex;
-}
-
-expression operator/(long double x, const expression &ex)
-{
-    return expression{x} / ex;
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression operator/(mppp::real128 x, const expression &ex)
-{
-    return expression{x} / ex;
-}
+HEYOKA_EX_BINARY_OP_L(+, mppp::real)
+HEYOKA_EX_BINARY_OP_L(-, mppp::real)
+HEYOKA_EX_BINARY_OP_L(*, mppp::real)
+HEYOKA_EX_BINARY_OP_L(/, mppp::real)
 
 #endif
 
-#if defined(HEYOKA_HAVE_REAL)
-
-expression operator/(mppp::real x, const expression &ex)
-{
-    return expression{std::move(x)} / ex;
-}
-
-#endif
+#undef HEYOKA_EX_BINARY_OP_R
+#undef HEYOKA_EX_BINARY_OP_L
 
 expression &operator+=(expression &x, const expression &e)
 {
@@ -322,117 +193,46 @@ expression &operator/=(expression &x, const expression &e)
     return x = x / e;
 }
 
-expression &operator+=(expression &ex, double x)
-{
-    return ex += expression{x};
-}
-
-expression &operator+=(expression &ex, long double x)
-{
-    return ex += expression{x};
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression &operator+=(expression &ex, mppp::real128 x)
-{
-    return ex += expression{x};
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression &operator+=(expression &ex, mppp::real x)
-{
-    return ex += expression{std::move(x)};
-}
-
-#endif
-
-expression &operator-=(expression &ex, double x)
-{
-    return ex -= expression{x};
-}
-
-expression &operator-=(expression &ex, long double x)
-{
-    return ex -= expression{x};
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression &operator-=(expression &ex, mppp::real128 x)
-{
-    return ex -= expression{x};
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression &operator-=(expression &ex, mppp::real x)
-{
-    return ex -= expression{std::move(x)};
-}
-
-#endif
-
-expression &operator*=(expression &ex, double x)
-{
-    return ex *= expression{x};
-}
-
-expression &operator*=(expression &ex, long double x)
-{
-    return ex *= expression{x};
-}
+// NOLINTBEGIN
+#define HEYOKA_EX_COMPOUND_OP(op, type)                                                                                \
+    expression &operator op(expression & ex, type x)                                                                   \
+    {                                                                                                                  \
+        return ex op expression{std::move(x)};                                                                         \
+    }
+// NOLINTEND
+
+HEYOKA_EX_COMPOUND_OP(+=, float)
+HEYOKA_EX_COMPOUND_OP(+=, double)
+HEYOKA_EX_COMPOUND_OP(+=, long double)
+HEYOKA_EX_COMPOUND_OP(-=, float)
+HEYOKA_EX_COMPOUND_OP(-=, double)
+HEYOKA_EX_COMPOUND_OP(-=, long double)
+HEYOKA_EX_COMPOUND_OP(*=, float)
+HEYOKA_EX_COMPOUND_OP(*=, double)
+HEYOKA_EX_COMPOUND_OP(*=, long double)
+HEYOKA_EX_COMPOUND_OP(/=, float)
+HEYOKA_EX_COMPOUND_OP(/=, double)
+HEYOKA_EX_COMPOUND_OP(/=, long double)
 
 #if defined(HEYOKA_HAVE_REAL128)
 
-expression &operator*=(expression &ex, mppp::real128 x)
-{
-    return ex *= expression{x};
-}
+HEYOKA_EX_COMPOUND_OP(+=, mppp::real128)
+HEYOKA_EX_COMPOUND_OP(-=, mppp::real128)
+HEYOKA_EX_COMPOUND_OP(*=, mppp::real128)
+HEYOKA_EX_COMPOUND_OP(/=, mppp::real128)
 
 #endif
 
 #if defined(HEYOKA_HAVE_REAL)
 
-expression &operator*=(expression &ex, mppp::real x)
-{
-    return ex *= expression{std::move(x)};
-}
+HEYOKA_EX_COMPOUND_OP(+=, mppp::real)
+HEYOKA_EX_COMPOUND_OP(-=, mppp::real)
+HEYOKA_EX_COMPOUND_OP(*=, mppp::real)
+HEYOKA_EX_COMPOUND_OP(/=, mppp::real)
 
 #endif
 
-expression &operator/=(expression &ex, double x)
-{
-    return ex /= expression{x};
-}
-
-expression &operator/=(expression &ex, long double x)
-{
-    return ex /= expression{x};
-}
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-expression &operator/=(expression &ex, mppp::real128 x)
-{
-    return ex /= expression{x};
-}
-
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-expression &operator/=(expression &ex, mppp::real x)
-{
-    return ex /= expression{std::move(x)};
-}
-
-#endif
+#undef HEYOKA_EX_COMPOUND_OP
 
 bool operator==(const expression &e1, const expression &e2)
 {
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index e48739be0..2d755158a 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -253,12 +253,14 @@ target_features get_target_features_impl()
     // Compute the recommended SIMD sizes.
     if (retval.avx512f || retval.avx2 || retval.avx) {
         // NOTE: keep the recommended SIMD size to
-        // 4 also for AVX512 due to perf issues in early
+        // 4 (double) / 8 (float) also for AVX512 due to perf issues in early
         // implementations. Revisit this in the future, possibly
         // making it conditional on the specific CPU model
         // in use.
+        retval.simd_size_flt = 8;
         retval.simd_size_dbl = 4;
     } else if (retval.sse2 || retval.aarch64 || retval.vsx || retval.vsx3) {
+        retval.simd_size_flt = 4;
         retval.simd_size_dbl = 2;
     }
 
@@ -298,6 +300,12 @@ const target_features &get_target_features()
 
 } // namespace detail
 
+template <>
+std::uint32_t recommended_simd_size<float>()
+{
+    return detail::get_target_features().simd_size_flt;
+}
+
 template <>
 std::uint32_t recommended_simd_size<double>()
 {
diff --git a/src/math/atan2.cpp b/src/math/atan2.cpp
index b9407d606..c2ef4a1a5 100644
--- a/src/math/atan2.cpp
+++ b/src/math/atan2.cpp
@@ -833,6 +833,11 @@ expression atan2(expression y, expression x)
     }
 }
 
+expression atan2(expression y, float x)
+{
+    return atan2(std::move(y), expression(x));
+}
+
 expression atan2(expression y, double x)
 {
     return atan2(std::move(y), expression(x));
@@ -861,6 +866,11 @@ expression atan2(expression y, mppp::real x)
 
 #endif
 
+expression atan2(float y, expression x)
+{
+    return atan2(expression(y), std::move(x));
+}
+
 expression atan2(double y, expression x)
 {
     return atan2(expression(y), std::move(x));
diff --git a/src/math/kepDE.cpp b/src/math/kepDE.cpp
index 49540c1bb..48769a751 100644
--- a/src/math/kepDE.cpp
+++ b/src/math/kepDE.cpp
@@ -186,6 +186,7 @@ expression kepDE(expression s0, expression c0, expression DM)
         return kepDE(expression{std::move(s0)}, std::move(c0), std::move(DM));                                         \
     }
 
+HEYOKA_DEFINE_KEPDE_OVERLOADS(float)
 HEYOKA_DEFINE_KEPDE_OVERLOADS(double)
 HEYOKA_DEFINE_KEPDE_OVERLOADS(long double)
 
diff --git a/src/math/kepE.cpp b/src/math/kepE.cpp
index 37cf42b27..41d6de353 100644
--- a/src/math/kepE.cpp
+++ b/src/math/kepE.cpp
@@ -868,6 +868,7 @@ expression kepE(expression e, expression M)
         return kepE(expression{std::move(e)}, std::move(M));                                                           \
     }
 
+HEYOKA_DEFINE_KEPE_OVERLOADS(float)
 HEYOKA_DEFINE_KEPE_OVERLOADS(double)
 HEYOKA_DEFINE_KEPE_OVERLOADS(long double)
 
diff --git a/src/math/kepF.cpp b/src/math/kepF.cpp
index b5f8ad225..47f0ce0c6 100644
--- a/src/math/kepF.cpp
+++ b/src/math/kepF.cpp
@@ -1785,6 +1785,7 @@ expression kepF(expression h, expression k, expression lam)
         return kepF(expression{std::move(h)}, std::move(k), std::move(lam));                                           \
     }
 
+HEYOKA_DEFINE_KEPF_OVERLOADS(float)
 HEYOKA_DEFINE_KEPF_OVERLOADS(double)
 HEYOKA_DEFINE_KEPF_OVERLOADS(long double)
 
diff --git a/src/math/pow.cpp b/src/math/pow.cpp
index f8b6706ba..a285a9e00 100644
--- a/src/math/pow.cpp
+++ b/src/math/pow.cpp
@@ -1180,6 +1180,11 @@ expression pow(expression b, expression e)
     return detail::pow_wrapper_impl(std::move(b), std::move(e));
 }
 
+expression pow(expression b, float e)
+{
+    return pow(std::move(b), expression{e});
+}
+
 expression pow(expression b, double e)
 {
     return pow(std::move(b), expression{e});
diff --git a/src/step_callback.cpp b/src/step_callback.cpp
index c0fad1462..b3a4a9f6e 100644
--- a/src/step_callback.cpp
+++ b/src/step_callback.cpp
@@ -110,6 +110,10 @@ void swap(step_callback_impl<TA> &a, step_callback_impl<TA> &b) noexcept
 }
 
 // Explicit instantiations.
+template class step_callback_impl<taylor_adaptive<float>>;
+template HEYOKA_DLL_PUBLIC void swap(step_callback_impl<taylor_adaptive<float>> &,
+                                     step_callback_impl<taylor_adaptive<float>> &);
+
 template class step_callback_impl<taylor_adaptive<double>>;
 template HEYOKA_DLL_PUBLIC void swap(step_callback_impl<taylor_adaptive<double>> &,
                                      step_callback_impl<taylor_adaptive<double>> &);
@@ -134,6 +138,10 @@ template HEYOKA_DLL_PUBLIC void swap(step_callback_impl<taylor_adaptive<mppp::re
 
 #endif
 
+template class step_callback_impl<taylor_adaptive_batch<float>>;
+template HEYOKA_DLL_PUBLIC void swap(step_callback_impl<taylor_adaptive_batch<float>> &,
+                                     step_callback_impl<taylor_adaptive_batch<float>> &);
+
 template class step_callback_impl<taylor_adaptive_batch<double>>;
 template HEYOKA_DLL_PUBLIC void swap(step_callback_impl<taylor_adaptive_batch<double>> &,
                                      step_callback_impl<taylor_adaptive_batch<double>> &);
diff --git a/src/taylor_00.cpp b/src/taylor_00.cpp
index c1f013f88..c99cb6ded 100644
--- a/src/taylor_00.cpp
+++ b/src/taylor_00.cpp
@@ -9,7 +9,6 @@
 #include <heyoka/config.hpp>
 
 #include <algorithm>
-#include <array>
 #include <cassert>
 #include <cmath>
 #include <cstddef>
@@ -2177,6 +2176,8 @@ const std::vector<T> &taylor_adaptive<T>::update_d_output(T time, bool rel_time)
 namespace detail
 {
 
+template class taylor_adaptive_base<float, taylor_adaptive<float>>;
+
 template class taylor_adaptive_base<double, taylor_adaptive<double>>;
 
 template class taylor_adaptive_base<long double, taylor_adaptive<long double>>;
@@ -2195,6 +2196,19 @@ template class taylor_adaptive_base<mppp::real, taylor_adaptive<mppp::real>>;
 
 } // namespace detail
 
+template class taylor_adaptive<float>;
+
+template HEYOKA_DLL_PUBLIC void
+taylor_adaptive<float>::finalise_ctor_impl(const std::vector<expression> &, std::vector<float>, std::optional<float>,
+                                           std::optional<float>, bool, bool, std::vector<float>, std::vector<t_event_t>,
+                                           std::vector<nt_event_t>, bool, std::optional<long long>);
+
+template HEYOKA_DLL_PUBLIC void
+taylor_adaptive<float>::finalise_ctor_impl(const std::vector<std::pair<expression, expression>> &, std::vector<float>,
+                                           std::optional<float>, std::optional<float>, bool, bool, std::vector<float>,
+                                           std::vector<t_event_t>, std::vector<nt_event_t>, bool,
+                                           std::optional<long long>);
+
 template class taylor_adaptive<double>;
 
 template HEYOKA_DLL_PUBLIC void taylor_adaptive<double>::finalise_ctor_impl(
@@ -4274,6 +4288,16 @@ void taylor_adaptive_batch<T>::reset_cooldowns(std::uint32_t i)
 }
 
 // Explicit instantiation of the batch implementation classes.
+template class taylor_adaptive_batch<float>;
+
+template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch<float>::finalise_ctor_impl(
+    const std::vector<expression> &, std::vector<float>, std::uint32_t, std::vector<float>, std::optional<float>, bool,
+    bool, std::vector<float>, std::vector<t_event_t>, std::vector<nt_event_t>, bool);
+
+template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch<float>::finalise_ctor_impl(
+    const std::vector<std::pair<expression, expression>> &, std::vector<float>, std::uint32_t, std::vector<float>,
+    std::optional<float>, bool, bool, std::vector<float>, std::vector<t_event_t>, std::vector<nt_event_t>, bool);
+
 template class taylor_adaptive_batch<double>;
 
 template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch<double>::finalise_ctor_impl(
diff --git a/src/taylor_01.cpp b/src/taylor_01.cpp
index 6eb23ca0b..e957532f5 100644
--- a/src/taylor_01.cpp
+++ b/src/taylor_01.cpp
@@ -1348,6 +1348,12 @@ std::ostream &taylor_adaptive_batch_stream_impl(std::ostream &os, const taylor_a
 
 } // namespace detail
 
+template <>
+std::ostream &operator<<(std::ostream &os, const taylor_adaptive<float> &ta)
+{
+    return detail::taylor_adaptive_stream_impl(os, ta);
+}
+
 template <>
 std::ostream &operator<<(std::ostream &os, const taylor_adaptive<double> &ta)
 {
@@ -1380,6 +1386,12 @@ std::ostream &operator<<(std::ostream &os, const taylor_adaptive<mppp::real> &ta
 
 #endif
 
+template <>
+std::ostream &operator<<(std::ostream &os, const taylor_adaptive_batch<float> &ta)
+{
+    return detail::taylor_adaptive_batch_stream_impl(os, ta);
+}
+
 template <>
 std::ostream &operator<<(std::ostream &os, const taylor_adaptive_batch<double> &ta)
 {
@@ -1623,6 +1635,18 @@ std::ostream &t_event_impl_stream_impl(std::ostream &os, const expression &eq, e
 
 } // namespace
 
+template <>
+std::ostream &operator<<(std::ostream &os, const nt_event_impl<float, false> &e)
+{
+    return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction());
+}
+
+template <>
+std::ostream &operator<<(std::ostream &os, const nt_event_impl<float, true> &e)
+{
+    return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction());
+}
+
 template <>
 std::ostream &operator<<(std::ostream &os, const nt_event_impl<double, false> &e)
 {
@@ -1673,6 +1697,18 @@ std::ostream &operator<<(std::ostream &os, const nt_event_impl<mppp::real, false
 
 #endif
 
+template <>
+std::ostream &operator<<(std::ostream &os, const t_event_impl<float, false> &e)
+{
+    return t_event_impl_stream_impl(os, e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown());
+}
+
+template <>
+std::ostream &operator<<(std::ostream &os, const t_event_impl<float, true> &e)
+{
+    return t_event_impl_stream_impl(os, e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown());
+}
+
 template <>
 std::ostream &operator<<(std::ostream &os, const t_event_impl<double, false> &e)
 {
@@ -1724,6 +1760,12 @@ std::ostream &operator<<(std::ostream &os, const t_event_impl<mppp::real, false>
 #endif
 
 // Explicit instantiation of the implementation classes/functions.
+template class nt_event_impl<float, false>;
+template class t_event_impl<float, false>;
+
+template class nt_event_impl<float, true>;
+template class t_event_impl<float, true>;
+
 template class nt_event_impl<double, false>;
 template class t_event_impl<double, false>;
 
@@ -2350,6 +2392,7 @@ std::size_t continuous_output<T>::get_n_steps() const
 }
 
 // Explicit instantiations.
+template class continuous_output<float>;
 template class continuous_output<double>;
 template class continuous_output<long double>;
 
@@ -2397,6 +2440,12 @@ std::ostream &c_out_stream_impl(std::ostream &os, const continuous_output<T> &co
 
 } // namespace detail
 
+template <>
+std::ostream &operator<<(std::ostream &os, const continuous_output<float> &co)
+{
+    return detail::c_out_stream_impl(os, co);
+}
+
 template <>
 std::ostream &operator<<(std::ostream &os, const continuous_output<double> &co)
 {
@@ -3096,6 +3145,7 @@ std::size_t continuous_output_batch<T>::get_n_steps() const
 }
 
 // Explicit instantiations.
+template class continuous_output_batch<float>;
 template class continuous_output_batch<double>;
 template class continuous_output_batch<long double>;
 
@@ -3160,6 +3210,12 @@ std::ostream &c_out_batch_stream_impl(std::ostream &os, const continuous_output_
 
 } // namespace detail
 
+template <>
+std::ostream &operator<<(std::ostream &os, const continuous_output_batch<float> &co)
+{
+    return detail::c_out_batch_stream_impl(os, co);
+}
+
 template <>
 std::ostream &operator<<(std::ostream &os, const continuous_output_batch<double> &co)
 {
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index b2b7df9e4..9f01c4b34 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -2053,6 +2053,16 @@ taylor_dc_t taylor_add_jet(llvm_state &s, const std::string &name,
 }
 
 // Explicit instantiations.
+template HEYOKA_DLL_PUBLIC taylor_dc_t taylor_add_jet<float>(llvm_state &, const std::string &,
+                                                             const std::vector<expression> &, std::uint32_t,
+                                                             std::uint32_t, bool, bool, const std::vector<expression> &,
+                                                             bool, long long);
+
+template HEYOKA_DLL_PUBLIC taylor_dc_t taylor_add_jet<float>(llvm_state &, const std::string &,
+                                                             const std::vector<std::pair<expression, expression>> &,
+                                                             std::uint32_t, std::uint32_t, bool, bool,
+                                                             const std::vector<expression> &, bool, long long);
+
 template HEYOKA_DLL_PUBLIC taylor_dc_t taylor_add_jet<double>(llvm_state &, const std::string &,
                                                               const std::vector<expression> &, std::uint32_t,
                                                               std::uint32_t, bool, bool,
@@ -2121,6 +2131,8 @@ T taylor_default_max_delta_t()
 }
 
 // Explicit instantiations.
+template HEYOKA_DLL_PUBLIC float taylor_default_max_delta_t<float>();
+
 template HEYOKA_DLL_PUBLIC double taylor_default_max_delta_t<double>();
 
 template HEYOKA_DLL_PUBLIC long double taylor_default_max_delta_t<long double>();
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c63522e77..c3e4d0232 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -34,6 +34,7 @@ endfunction()
 ADD_HEYOKA_TESTCASE(readme_test)
 ADD_HEYOKA_TESTCASE(llvm_state)
 ADD_HEYOKA_TESTCASE(expression)
+ADD_HEYOKA_TESTCASE(expression_basic_api)
 ADD_HEYOKA_TESTCASE(expression_diff_tensors)
 ADD_HEYOKA_TESTCASE(expression_folding)
 ADD_HEYOKA_TESTCASE(expression_fix)
diff --git a/test/acos.cpp b/test/acos.cpp
index 3c997b7ca..241fd9167 100644
--- a/test/acos.cpp
+++ b/test/acos.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -226,8 +226,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(acos(x), {{x, .1_dbl}})) == acos(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -287,3 +287,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {acos(a), acos(b), acos(c), acos(d)});
+
+    s.compile();
+
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{.1f, .2f, .3f, .4f};
+    std::vector<float> outs(4u, 0.);
+
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+
+    REQUIRE(outs[0] == approximately(std::acos(.1f)));
+    REQUIRE(outs[1] == approximately(std::acos(.2f)));
+    REQUIRE(outs[2] == approximately(std::acos(.3f)));
+    REQUIRE(outs[3] == approximately(std::acos(.4f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/acosh.cpp b/test/acosh.cpp
index 0d6f53d4e..d70ec1bb8 100644
--- a/test/acosh.cpp
+++ b/test/acosh.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -226,8 +226,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(acosh(x), {{x, 1.5_dbl}})) == acosh(1.5_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -287,3 +287,67 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: compile a cfunc with four acosh() outputs, one per variable.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {acosh(a), acosh(b), acosh(c), acosh(d)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1.1f, 1.2f, 1.3f, 1.4f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::acosh() evaluated in float on the same input.
+    REQUIRE(outs[0] == approximately(std::acosh(1.1f)));
+    REQUIRE(outs[1] == approximately(std::acosh(1.2f)));
+    REQUIRE(outs[2] == approximately(std::acosh(1.3f)));
+    REQUIRE(outs[3] == approximately(std::acosh(1.4f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@acoshf" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acoshf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/asin.cpp b/test/asin.cpp
index 805b1d2b5..713df6695 100644
--- a/test/asin.cpp
+++ b/test/asin.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -226,8 +226,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(asin(x), {{x, .1_dbl}})) == asin(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -287,3 +287,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: compile a cfunc with four asin() outputs, one per variable.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {asin(a), asin(b), asin(c), asin(d)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{.1f, .2f, .3f, .4f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::asin() evaluated in float on the same input.
+    REQUIRE(outs[0] == approximately(std::asin(.1f)));
+    REQUIRE(outs[1] == approximately(std::asin(.2f)));
+    REQUIRE(outs[2] == approximately(std::asin(.3f)));
+    REQUIRE(outs[3] == approximately(std::asin(.4f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@asinf" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/asinh.cpp b/test/asinh.cpp
index ec361e960..f44bfe307 100644
--- a/test/asinh.cpp
+++ b/test/asinh.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -226,8 +226,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(asinh(x), {{x, -.5_dbl}})) == asinh(-.5_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -287,3 +287,67 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: compile a cfunc with four asinh() outputs, one per variable.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {asinh(a), asinh(b), asinh(c), asinh(d)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1.1f, 1.2f, 1.3f, 1.4f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::asinh() evaluated in float on the same input.
+    REQUIRE(outs[0] == approximately(std::asinh(1.1f)));
+    REQUIRE(outs[1] == approximately(std::asinh(1.2f)));
+    REQUIRE(outs[2] == approximately(std::asinh(1.3f)));
+    REQUIRE(outs[3] == approximately(std::asinh(1.4f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@asinhf" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinhf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/atan.cpp b/test/atan.cpp
index 391f62389..6d69c7b20 100644
--- a/test/atan.cpp
+++ b/test/atan.cpp
@@ -49,7 +49,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -220,8 +220,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(atan(x), {{x, .1_dbl}})) == atan(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -281,3 +281,67 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: compile a cfunc with four atan() outputs, one per variable.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {atan(a), atan(b), atan(c), atan(d)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{.1f, .2f, .3f, .4f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::atan() evaluated in float on the same input.
+    REQUIRE(outs[0] == approximately(std::atan(.1f)));
+    REQUIRE(outs[1] == approximately(std::atan(.2f)));
+    REQUIRE(outs[2] == approximately(std::atan(.3f)));
+    REQUIRE(outs[3] == approximately(std::atan(.4f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@atanf" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/atan2.cpp b/test/atan2.cpp
index bf512833a..e64618142 100644
--- a/test/atan2.cpp
+++ b/test/atan2.cpp
@@ -10,7 +10,6 @@
 
 #include <algorithm>
 #include <cmath>
-#include <cstdint>
 #include <initializer_list>
 #include <limits>
 #include <random>
@@ -62,7 +61,7 @@ using namespace mppp::literals;
 
 #endif
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -110,7 +109,11 @@ TEST_CASE("atan2 diff")
 
 TEST_CASE("atan2 overloads")
 {
-    auto k = atan2("x"_var, 1.1);
+    auto k = atan2("x"_var, 1.1f);
+    REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1f});
+
+    k = atan2("x"_var, 1.1);
     REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
     REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1});
 
@@ -130,6 +133,10 @@ TEST_CASE("atan2 overloads")
     REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1_r256});
 #endif
 
+    k = atan2(1.1f, "x"_var);
+    REQUIRE(std::get<func>(k.value()).args()[1] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[0].value()) == number{1.1f});
+
     k = atan2(1.1, "x"_var);
     REQUIRE(std::get<func>(k.value()).args()[1] == "x"_var);
     REQUIRE(std::get<number>(std::get<func>(k.value()).args()[0].value()) == number{1.1});
@@ -324,8 +331,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(atan2(x, y), {{x, .1_dbl}, {y, .2_dbl}})) == atan2(.1_dbl, .2_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -385,3 +392,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: a cfunc with four atan2() outputs, each pairing a variable with a float constant.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {atan2(a, .5f), atan2(b, .6f), atan2(c, .7f), atan2(d, .8f)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{.1f, .2f, .3f, .4f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::atan2() evaluated in float on the same arguments.
+    REQUIRE(outs[0] == approximately(std::atan2(.1f, .5f)));
+    REQUIRE(outs[1] == approximately(std::atan2(.2f, .6f)));
+    REQUIRE(outs[2] == approximately(std::atan2(.3f, .7f)));
+    REQUIRE(outs[3] == approximately(std::atan2(.4f, .8f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@atan2f" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2f", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/atanh.cpp b/test/atanh.cpp
index 191582bfb..98efb30b0 100644
--- a/test/atanh.cpp
+++ b/test/atanh.cpp
@@ -49,7 +49,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -220,8 +220,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(atanh(x), {{x, .5_dbl}})) == atanh(.5_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -281,3 +281,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: compile a cfunc with four atanh() outputs, one per variable.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {atanh(a), atanh(b), atanh(c), atanh(d)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{.1f, .2f, .3f, .4f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::atanh() evaluated in float on the same input.
+    REQUIRE(outs[0] == approximately(std::atanh(.1f)));
+    REQUIRE(outs[1] == approximately(std::atanh(.2f)));
+    REQUIRE(outs[2] == approximately(std::atanh(.3f)));
+    REQUIRE(outs[3] == approximately(std::atanh(.4f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@atanhf" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanhf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/c_output.cpp b/test/c_output.cpp
index b8554f0ce..c282a0da3 100644
--- a/test/c_output.cpp
+++ b/test/c_output.cpp
@@ -38,7 +38,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -122,10 +122,10 @@ TEST_CASE("scalar")
         REQUIRE(d_out->get_n_steps() > 0u);
 
         // Try slightly outside the bounds.
-        (*d_out)(-.01);
+        (*d_out)(fp_t(-.01));
         REQUIRE(d_out->get_output()[0] == approximately(sin(fp_t(-0.01))));
         REQUIRE(d_out->get_output()[1] == approximately(cos(fp_t(-0.01))));
-        (*d_out)(10.01);
+        (*d_out)(fp_t(10.01));
         REQUIRE(d_out->get_output()[0] == approximately(sin(fp_t(10.01))));
         REQUIRE(d_out->get_output()[1] == approximately(cos(fp_t(10.01))));
 
@@ -200,10 +200,10 @@ TEST_CASE("scalar")
         REQUIRE(d_out->get_n_steps() > 0u);
 
         // Try slightly outside the bounds.
-        (*d_out)(.01);
+        (*d_out)(fp_t(.01));
         REQUIRE(d_out->get_output()[0] == approximately(sin(fp_t(0.01))));
         REQUIRE(d_out->get_output()[1] == approximately(cos(fp_t(0.01))));
-        (*d_out)(-10.01);
+        (*d_out)(fp_t(-10.01));
         REQUIRE(d_out->get_output()[0] == approximately(sin(fp_t(-10.01))));
         REQUIRE(d_out->get_output()[1] == approximately(cos(fp_t(-10.01))));
 
@@ -345,7 +345,7 @@ TEST_CASE("batch")
         // The vector of final times.
         std::vector<fp_t> final_tm;
         for (auto i = 0u; i < batch_size; ++i) {
-            final_tm.push_back(10. + fp_t(i) / 100);
+            final_tm.push_back(fp_t(10.) + fp_t(i) / 100);
         }
 
         // Create a random batch grid.
@@ -357,7 +357,7 @@ TEST_CASE("batch")
 
             std::uniform_real_distribution<double> rdist(1e-6, 10. + i / 100. - 1e-6);
             for (auto j = 0u; j < n_points - 2u; ++j) {
-                tmp[j] = rdist(rng);
+                tmp[j] = static_cast<fp_t>(rdist(rng));
             }
             std::sort(tmp.begin(), tmp.end());
 
@@ -424,7 +424,7 @@ TEST_CASE("batch")
 
         // Try slightly outside the bounds.
         for (auto j = 0u; j < batch_size; ++j) {
-            loc_time[j] = -0.01;
+            loc_time[j] = fp_t(-0.01);
         }
         (*d_out)(loc_time);
         for (auto j = 0u; j < batch_size; ++j) {
@@ -434,7 +434,7 @@ TEST_CASE("batch")
                     == approximately(-ic[j] * sin(loc_time[j]) + ic[batch_size + j] * cos(loc_time[j])));
         }
         for (auto j = 0u; j < batch_size; ++j) {
-            loc_time[j] = final_tm[j] + 0.01;
+            loc_time[j] = final_tm[j] + fp_t(0.01);
         }
         (*d_out)(loc_time);
         for (auto j = 0u; j < batch_size; ++j) {
@@ -560,7 +560,7 @@ TEST_CASE("batch")
 
         // Try slightly outside the bounds.
         for (auto j = 0u; j < batch_size; ++j) {
-            loc_time[j] = 0.01;
+            loc_time[j] = fp_t(0.01);
         }
         (*d_out)(loc_time);
         for (auto j = 0u; j < batch_size; ++j) {
@@ -570,7 +570,7 @@ TEST_CASE("batch")
                     == approximately(-ic[j] * sin(loc_time[j]) + ic[batch_size + j] * cos(loc_time[j])));
         }
         for (auto j = 0u; j < batch_size; ++j) {
-            loc_time[j] = final_tm[j] - 0.01;
+            loc_time[j] = final_tm[j] - fp_t(0.01);
         }
         (*d_out)(loc_time);
         for (auto j = 0u; j < batch_size; ++j) {
diff --git a/test/constants.cpp b/test/constants.cpp
index c2aee8f14..4c8916d6c 100644
--- a/test/constants.cpp
+++ b/test/constants.cpp
@@ -50,7 +50,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/cos.cpp b/test/cos.cpp
index c0ccf0e0d..8846d465a 100644
--- a/test/cos.cpp
+++ b/test/cos.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -243,8 +243,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(cos(x), {{x, .1_dbl}})) == cos(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -307,3 +307,69 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: compile a cfunc with four cos() outputs, one per variable.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {cos(a), cos(b), cos(c), cos(d)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1.f, 2.f, 3.f, 4.f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::cos() evaluated in float on the same input.
+    REQUIRE(outs[0] == approximately(std::cos(1.f)));
+    REQUIRE(outs[1] == approximately(std::cos(2.f)));
+    REQUIRE(outs[2] == approximately(std::cos(3.f)));
+    REQUIRE(outs[3] == approximately(std::cos(4.f)));
+
+#if defined(HEYOKA_WITH_SLEEF)
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@llvm.cos.f32" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f32", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+#if LLVM_VERSION_MAJOR >= 16
+
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+#endif
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/cosh.cpp b/test/cosh.cpp
index 31b58c9c8..c6ca5c8b9 100644
--- a/test/cosh.cpp
+++ b/test/cosh.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -220,8 +220,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(cosh(x), {{x, 1.5_dbl}})) == cosh(1.5_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -281,3 +281,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+    // Single-precision variant: compile a cfunc with four cosh() outputs, one per variable.
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {cosh(a), cosh(b), cosh(c), cosh(d)});
+
+    s.compile();
+    // Look up the JIT-compiled function and cast it to a function pointer operating on float arrays.
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1.f, 2.f, 3.f, 4.f};
+    std::vector<float> outs(4u, 0.);
+    // Evaluate once; the two trailing pointer arguments are passed as null.
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+    // Each output must approximately match std::cosh() evaluated in float on the same input.
+    REQUIRE(outs[0] == approximately(std::cosh(1.f)));
+    REQUIRE(outs[1] == approximately(std::cosh(2.f)));
+    REQUIRE(outs[2] == approximately(std::cosh(3.f)));
+    REQUIRE(outs[3] == approximately(std::cosh(4.f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+    // Count the (case-insensitive) occurrences of "@coshf" in the IR text.
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@coshf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/dfloat_time.cpp b/test/dfloat_time.cpp
index 06f5ed57f..beff5ab4c 100644
--- a/test/dfloat_time.cpp
+++ b/test/dfloat_time.cpp
@@ -37,7 +37,7 @@ const int ntrials = 100;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -71,7 +71,7 @@ TEST_CASE("scalar test")
         std::uniform_real_distribution rdist(-1e-9, 1e-9);
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time(0);
             ta.get_state_data()[0] = 0;
@@ -93,7 +93,7 @@ TEST_CASE("scalar test")
         err = 0;
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time(0);
             ta.get_state_data()[0] = 0;
@@ -115,7 +115,7 @@ TEST_CASE("scalar test")
         err = 0;
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time(0);
             ta.get_state_data()[0] = 0;
@@ -139,7 +139,7 @@ TEST_CASE("scalar test")
         err = 0;
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time(0);
             ta.get_state_data()[0] = 0;
@@ -182,8 +182,8 @@ TEST_CASE("batch test")
         std::uniform_real_distribution rdist(-1e-9, 1e-9);
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
-            const auto v1 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
+            const auto v1 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time({fp_t(0), 0});
             ta.get_state_data()[0] = 0;
@@ -212,8 +212,8 @@ TEST_CASE("batch test")
         final_time = std::vector{fp_t(-10000.), fp_t(-11000.)};
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
-            const auto v1 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
+            const auto v1 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time({fp_t(0), 0});
             ta.get_state_data()[0] = 0;
@@ -240,8 +240,8 @@ TEST_CASE("batch test")
         err = 0;
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
-            const auto v1 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
+            const auto v1 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time({fp_t(0), 0});
             ta.get_state_data()[0] = 0;
@@ -270,8 +270,8 @@ TEST_CASE("batch test")
         err = 0;
 
         for (auto i = 0; i < ntrials; ++i) {
-            const auto v0 = fp_t(1) + rdist(rng);
-            const auto v1 = fp_t(1) + rdist(rng);
+            const auto v0 = fp_t(1) + static_cast<fp_t>(rdist(rng));
+            const auto v1 = fp_t(1) + static_cast<fp_t>(rdist(rng));
 
             ta.set_time({fp_t(0), 0});
             ta.get_state_data()[0] = 0;
diff --git a/test/div.cpp b/test/div.cpp
index 9334edea4..99180f668 100644
--- a/test/div.cpp
+++ b/test/div.cpp
@@ -46,7 +46,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/ensemble_propagate.cpp b/test/ensemble_propagate.cpp
index d17ccbd19..a45237591 100644
--- a/test/ensemble_propagate.cpp
+++ b/test/ensemble_propagate.cpp
@@ -31,7 +31,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(__FreeBSD__)
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
diff --git a/test/erf.cpp b/test/erf.cpp
index 1cfbd9aee..239aac9ed 100644
--- a/test/erf.cpp
+++ b/test/erf.cpp
@@ -52,7 +52,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -227,8 +227,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(erf(x), {{x, 1.5_dbl}})) == erf(1.5_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -288,3 +288,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state state{kw::slp_vectorize = true};
+
+    auto [x0, x1, x2, x3] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(state, "cfunc", {erf(x0), erf(x1), erf(x2), erf(x3)});
+
+    state.compile();
+
+    using cfunc_t = void (*)(float *, const float *, const float *, const float *);
+    auto *cfunc = reinterpret_cast<cfunc_t>(state.jit_lookup("cfunc"));
+
+    const std::vector<float> inputs{.1f, .2f, .3f, .4f};
+    std::vector<float> outputs(4u, 0.f);
+
+    cfunc(outputs.data(), inputs.data(), nullptr, nullptr);
+
+    // Every JIT-evaluated output must agree with the standard library's erf().
+    for (auto i = 0u; i < 4u; ++i) {
+        REQUIRE(outputs[i] == approximately(std::erf(inputs[i])));
+    }
+
+    // NOTE: auto-vectorisation of calls to external scalar functions
+    // appears to work only from LLVM 16 onwards.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &features = detail::get_target_features();
+
+    auto ir_text = state.get_ir();
+
+    using ir_match_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto n_scalar = 0u;
+    auto match = boost::make_find_iterator(ir_text, boost::first_finder("@erff", boost::is_iequal()));
+    for (; match != ir_match_iterator(); ++match) {
+        ++n_scalar;
+    }
+
+    // NOTE: for now the CI covers the full range of LLVM versions
+    // only on x86_64.
+    if (features.sse2) {
+        // NOTE: expected occurrences of the scalar function:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(n_scalar == 5u);
+    }
+
+    if (features.aarch64) {
+        REQUIRE(n_scalar == 5u);
+    }
+
+    // NOTE: at present ppc64 is not auto-vectorised, apparently because
+    // of how orc/lljit sets up the target machine (the opt tool handles it
+    // fine). Once that is sorted out, ppc64 can be checked as well.
+
+    // if (features.vsx) {
+    //     REQUIRE(n_scalar == 5u);
+    // }
+
+#endif
+}
diff --git a/test/exp.cpp b/test/exp.cpp
index 1aec93baf..768234042 100644
--- a/test/exp.cpp
+++ b/test/exp.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -234,8 +234,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(exp(x), {{x, .1_dbl}})) == exp(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -298,3 +298,69 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state state{kw::slp_vectorize = true};
+
+    auto [x0, x1, x2, x3] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(state, "cfunc", {exp(x0), exp(x1), exp(x2), exp(x3)});
+
+    state.compile();
+
+    using cfunc_t = void (*)(float *, const float *, const float *, const float *);
+    auto *cfunc = reinterpret_cast<cfunc_t>(state.jit_lookup("cfunc"));
+
+    const std::vector<float> inputs{1.f, 2.f, 3.f, 4.f};
+    std::vector<float> outputs(4u, 0.f);
+
+    cfunc(outputs.data(), inputs.data(), nullptr, nullptr);
+
+    // Every JIT-evaluated output must agree with the standard library's exp().
+    for (auto i = 0u; i < 4u; ++i) {
+        REQUIRE(outputs[i] == approximately(std::exp(inputs[i])));
+    }
+
+#if defined(HEYOKA_WITH_SLEEF)
+
+    const auto &features = detail::get_target_features();
+
+    auto ir_text = state.get_ir();
+
+    using ir_match_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto n_scalar = 0u;
+    auto match = boost::make_find_iterator(ir_text, boost::first_finder("@llvm.exp.f32", boost::is_iequal()));
+    for (; match != ir_match_iterator(); ++match) {
+        ++n_scalar;
+    }
+
+    // NOTE: for now the CI covers the full range of LLVM versions
+    // only on x86_64.
+    if (features.sse2) {
+        // NOTE: expected occurrences of the scalar intrinsic:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(n_scalar == 5u);
+    }
+
+#if LLVM_VERSION_MAJOR >= 16
+
+    // NOTE: on arm64 the CI currently tests LLVM 16.
+    if (features.aarch64) {
+        REQUIRE(n_scalar == 5u);
+    }
+
+#endif
+
+    // NOTE: at present ppc64 is not auto-vectorised, apparently because
+    // of how orc/lljit sets up the target machine (the opt tool handles it
+    // fine). Once that is sorted out, ppc64 can be checked as well.
+
+    // if (features.vsx) {
+    //     REQUIRE(n_scalar == 5u);
+    // }
+
+#endif
+}
diff --git a/test/expression_basic_api.cpp b/test/expression_basic_api.cpp
new file mode 100644
index 000000000..4fc75f88b
--- /dev/null
+++ b/test/expression_basic_api.cpp
@@ -0,0 +1,331 @@
+// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <heyoka/config.hpp>
+
+#include <variant>
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+#include <mp++/real128.hpp>
+
+#endif
+
+#if defined(HEYOKA_HAVE_REAL)
+
+#include <mp++/real.hpp>
+
+#endif
+
+#include <heyoka/expression.hpp>
+#include <heyoka/number.hpp>
+
+#include "catch.hpp"
+
+using namespace heyoka;
+
+TEST_CASE("number ctors") // An expression built from a value must store the number built from that same value.
+{
+    REQUIRE(std::get<number>(expression{1.1f}.value()) == number{1.1f}); // float
+    REQUIRE(std::get<number>(expression{1.1}.value()) == number{1.1}); // double
+    REQUIRE(std::get<number>(expression{1.1l}.value()) == number{1.1l}); // long double
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    REQUIRE(std::get<number>(expression{mppp::real128{"1.1"}}.value()) == number{mppp::real128{"1.1"}});
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    REQUIRE(std::get<number>(expression{mppp::real{"1.1", 123}}.value()) == number{mppp::real{"1.1", 123}});
+
+#endif // HEYOKA_HAVE_REAL
+}
+
+TEST_CASE("literals") // Each user-defined literal must equal the expression built from the matching constant.
+{
+    REQUIRE(1.1_flt == expression{1.1f}); // Floating-point form of the literal.
+    REQUIRE(1111111111111111111_flt == expression{1111111111111111111.f}); // Integer form of the literal.
+
+    REQUIRE(1.1_dbl == expression{1.1});
+    REQUIRE(1111111111111111111_dbl == expression{1111111111111111111.});
+
+    REQUIRE(1.1_ldbl == expression{1.1l});
+    REQUIRE(1111111111111111111_ldbl == expression{1111111111111111111.l});
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    using namespace mppp::literals; // Brings the _rq literal into scope.
+
+    REQUIRE(1.1_f128 == expression{1.1_rq});
+    REQUIRE(1111111111111111111_f128 == expression{1111111111111111111._rq});
+
+#endif // HEYOKA_HAVE_REAL128
+}
+
+TEST_CASE("number binary ops") // expr OP number (either order) must equal the expression of the computed value.
+{
+
+    REQUIRE(1_flt + 1.1f == expression{1.f + 1.1f}); // Addition, plain number on the right-hand side.
+    REQUIRE(1.1f + 1_flt == expression{1.f + 1.1f}); // Addition, plain number on the left-hand side.
+
+    REQUIRE(1_dbl + 1.1 == expression{1. + 1.1});
+    REQUIRE(1.1 + 1_dbl == expression{1. + 1.1});
+
+    REQUIRE(1_ldbl + 1.1l == expression{1.l + 1.1l});
+    REQUIRE(1.1l + 1_ldbl == expression{1.l + 1.1l});
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    using namespace mppp::literals; // Stays in scope for the later real128 sections of this test case.
+
+    REQUIRE(1_f128 + 1.1_rq == expression{1._rq + 1.1_rq});
+    REQUIRE(1.1_rq + 1_f128 == expression{1._rq + 1.1_rq});
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    REQUIRE(1_dbl + mppp::real{"1.1", 345} == expression{1. + mppp::real{"1.1", 345}});
+    REQUIRE(mppp::real{"1.1", 345} + 1_dbl == expression{1. + mppp::real{"1.1", 345}});
+
+#endif // HEYOKA_HAVE_REAL
+
+    REQUIRE(1_flt - 1.1f == expression{1.f - 1.1f}); // Subtraction.
+    REQUIRE(1.1f - 1_flt == expression{1.1f - 1.f});
+
+    REQUIRE(1_dbl - 1.1 == expression{1. - 1.1});
+    REQUIRE(1.1 - 1_dbl == expression{1.1 - 1.});
+
+    REQUIRE(1_ldbl - 1.1l == expression{1.l - 1.1l});
+    REQUIRE(1.1l - 1_ldbl == expression{1.1l - 1.l});
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    REQUIRE(1_f128 - 1.1_rq == expression{1._rq - 1.1_rq});
+    REQUIRE(1.1_rq - 1_f128 == expression{1.1_rq - 1._rq});
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    REQUIRE(1_dbl - mppp::real{"1.1", 345} == expression{1. - mppp::real{"1.1", 345}});
+    REQUIRE(mppp::real{"1.1", 345} - 1_dbl == expression{mppp::real{"1.1", 345} - 1.});
+
+#endif // HEYOKA_HAVE_REAL
+
+    REQUIRE(1_flt * 1.1f == expression{1.f * 1.1f}); // Multiplication.
+    REQUIRE(1.1f * 1_flt == expression{1.1f * 1.f});
+
+    REQUIRE(1_dbl * 1.1 == expression{1. * 1.1});
+    REQUIRE(1.1 * 1_dbl == expression{1.1 * 1.});
+
+    REQUIRE(1_ldbl * 1.1l == expression{1.l * 1.1l});
+    REQUIRE(1.1l * 1_ldbl == expression{1.1l * 1.l});
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    REQUIRE(1_f128 * 1.1_rq == expression{1._rq * 1.1_rq});
+    REQUIRE(1.1_rq * 1_f128 == expression{1.1_rq * 1._rq});
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    REQUIRE(1_dbl * mppp::real{"1.1", 345} == expression{1. * mppp::real{"1.1", 345}});
+    REQUIRE(mppp::real{"1.1", 345} * 1_dbl == expression{mppp::real{"1.1", 345} * 1.});
+
+#endif // HEYOKA_HAVE_REAL
+
+    REQUIRE(1_flt / 1.1f == expression{1.f / 1.1f}); // Division.
+    REQUIRE(1.1f / 1_flt == expression{1.1f / 1.f});
+
+    REQUIRE(1_dbl / 1.1 == expression{1. / 1.1});
+    REQUIRE(1.1 / 1_dbl == expression{1.1 / 1.});
+
+    REQUIRE(1_ldbl / 1.1l == expression{1.l / 1.1l});
+    REQUIRE(1.1l / 1_ldbl == expression{1.1l / 1.l});
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    REQUIRE(1_f128 / 1.1_rq == expression{1._rq / 1.1_rq});
+    REQUIRE(1.1_rq / 1_f128 == expression{1.1_rq / 1._rq});
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    REQUIRE(1_dbl / mppp::real{"1.1", 345} == expression{1. / mppp::real{"1.1", 345}});
+    REQUIRE(mppp::real{"1.1", 345} / 1_dbl == expression{mppp::real{"1.1", 345} / 1.});
+
+#endif // HEYOKA_HAVE_REAL
+}
+
+TEST_CASE("number compound ops") // In-place operators must give the same result as the matching binary operator.
+{
+    {
+        auto ex = 1_flt;
+        ex += 1.1f; // In-place addition.
+        REQUIRE(ex == 1_flt + 1.1f);
+    }
+
+    {
+        auto ex = 1_dbl;
+        ex += 1.1;
+        REQUIRE(ex == 1_dbl + 1.1);
+    }
+
+    {
+        auto ex = 1_ldbl;
+        ex += 1.1l;
+        REQUIRE(ex == 1_ldbl + 1.1l);
+    }
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    {
+        using namespace mppp::literals;
+
+        auto ex = 1_f128;
+        ex += 1.1_rq;
+        REQUIRE(ex == 1_f128 + 1.1_rq);
+    }
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    {
+        auto ex = 1_dbl;
+        ex += mppp::real{"1.1", 345};
+        REQUIRE(ex == 1_dbl + mppp::real{"1.1", 345});
+    }
+
+#endif // HEYOKA_HAVE_REAL
+
+    {
+        auto ex = 1_flt;
+        ex -= 1.1f; // In-place subtraction.
+        REQUIRE(ex == 1_flt - 1.1f);
+    }
+
+    {
+        auto ex = 1_dbl;
+        ex -= 1.1;
+        REQUIRE(ex == 1_dbl - 1.1);
+    }
+
+    {
+        auto ex = 1_ldbl;
+        ex -= 1.1l;
+        REQUIRE(ex == 1_ldbl - 1.1l);
+    }
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    {
+        using namespace mppp::literals;
+
+        auto ex = 1_f128;
+        ex -= 1.1_rq;
+        REQUIRE(ex == 1_f128 - 1.1_rq);
+    }
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    {
+        auto ex = 1_dbl;
+        ex -= mppp::real{"1.1", 345};
+        REQUIRE(ex == 1_dbl - mppp::real{"1.1", 345});
+    }
+
+#endif // HEYOKA_HAVE_REAL
+
+    {
+        auto ex = 1_flt;
+        ex *= 1.1f; // In-place multiplication.
+        REQUIRE(ex == 1_flt * 1.1f);
+    }
+
+    {
+        auto ex = 1_dbl;
+        ex *= 1.1;
+        REQUIRE(ex == 1_dbl * 1.1);
+    }
+
+    {
+        auto ex = 1_ldbl;
+        ex *= 1.1l;
+        REQUIRE(ex == 1_ldbl * 1.1l);
+    }
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    {
+        using namespace mppp::literals;
+
+        auto ex = 1_f128;
+        ex *= 1.1_rq;
+        REQUIRE(ex == 1_f128 * 1.1_rq);
+    }
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    {
+        auto ex = 1_dbl;
+        ex *= mppp::real{"1.1", 345};
+        REQUIRE(ex == 1_dbl * mppp::real{"1.1", 345});
+    }
+
+#endif // HEYOKA_HAVE_REAL
+
+    {
+        auto ex = 1_flt;
+        ex /= 1.1f; // In-place division.
+        REQUIRE(ex == 1_flt / 1.1f);
+    }
+
+    {
+        auto ex = 1_dbl;
+        ex /= 1.1;
+        REQUIRE(ex == 1_dbl / 1.1);
+    }
+
+    {
+        auto ex = 1_ldbl;
+        ex /= 1.1l;
+        REQUIRE(ex == 1_ldbl / 1.1l);
+    }
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+    {
+        using namespace mppp::literals;
+
+        auto ex = 1_f128;
+        ex /= 1.1_rq;
+        REQUIRE(ex == 1_f128 / 1.1_rq);
+    }
+
+#endif // HEYOKA_HAVE_REAL128
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    {
+        auto ex = 1_dbl;
+        ex /= mppp::real{"1.1", 345};
+        REQUIRE(ex == 1_dbl / mppp::real{"1.1", 345});
+    }
+
+#endif // HEYOKA_HAVE_REAL
+}
diff --git a/test/kepDE.cpp b/test/kepDE.cpp
index 5ba8f0d17..b1b8d5b13 100644
--- a/test/kepDE.cpp
+++ b/test/kepDE.cpp
@@ -48,7 +48,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -233,6 +233,7 @@ TEST_CASE("kepDE diff")
 
 TEST_CASE("kepDE overloads")
 {
+    HEYOKA_TEST_KEPDE_OVERLOAD(float);
     HEYOKA_TEST_KEPDE_OVERLOAD(double);
     HEYOKA_TEST_KEPDE_OVERLOAD(long double);
 
@@ -279,6 +280,8 @@ TEST_CASE("kepDE s11n")
 TEST_CASE("cfunc")
 {
     using std::isnan;
+    using std::nextafter;
+    using std::sqrt;
 
     auto tester = [](auto fp_x, unsigned opt_level, bool high_accuracy, bool compact_mode) {
         using fp_t = decltype(fp_x);
@@ -294,11 +297,12 @@ TEST_CASE("cfunc")
 
         auto generate_hk = [&h_dist]() {
             // Generate h.
-            auto h_val = h_dist(rng);
+            auto h_val = static_cast<fp_t>(h_dist(rng));
 
             // Generate a k such that h**2+k**2<1.
-            const auto max_abs_k = std::sqrt(1. - h_val * h_val);
-            std::uniform_real_distribution<double> k_dist(std::nextafter(-max_abs_k, 0.), max_abs_k);
+            const auto max_abs_k = sqrt(fp_t(1) - h_val * h_val);
+            std::uniform_real_distribution<double> k_dist(static_cast<double>(nextafter(-max_abs_k, fp_t(0))),
+                                                          static_cast<double>(max_abs_k));
             auto k_val = static_cast<fp_t>(k_dist(rng));
 
             return std::make_pair(static_cast<fp_t>(h_val), std::move(k_val));
@@ -335,7 +339,7 @@ TEST_CASE("cfunc")
                     // Generate the hs and ks.
                     auto [hval, kval] = generate_hk();
                     // Generate the lam.
-                    auto lamval = lam_dist(rng);
+                    auto lamval = static_cast<fp_t>(lam_dist(rng));
 
                     ins[i] = hval;
                     ins[i + batch_size] = kval;
@@ -359,8 +363,8 @@ TEST_CASE("cfunc")
                     auto hval = ins[i];
                     auto kval = ins[i + batch_size];
                     auto lamval = ins[i + 2u * batch_size];
-                    REQUIRE(eps_close(cos(lamval), cos(Fval + hval * (1. - cos(Fval)) - kval * sin(Fval))));
-                    REQUIRE(eps_close(sin(lamval), sin(Fval + hval * (1. - cos(Fval)) - kval * sin(Fval))));
+                    REQUIRE(eps_close(cos(lamval), cos(Fval + hval * (fp_t(1) - cos(Fval)) - kval * sin(Fval))));
+                    REQUIRE(eps_close(sin(lamval), sin(Fval + hval * (fp_t(1) - cos(Fval)) - kval * sin(Fval))));
 
                     // Second output.
                     REQUIRE(!isnan(outs[i + batch_size]));
@@ -368,17 +372,17 @@ TEST_CASE("cfunc")
                     hval = pars[i];
                     kval = pars[i + batch_size];
                     lamval = ins[i + 2u * batch_size];
-                    REQUIRE(eps_close(cos(lamval), cos(Fval + hval * (1. - cos(Fval)) - kval * sin(Fval))));
-                    REQUIRE(eps_close(sin(lamval), sin(Fval + hval * (1. - cos(Fval)) - kval * sin(Fval))));
+                    REQUIRE(eps_close(cos(lamval), cos(Fval + hval * (fp_t(1) - cos(Fval)) - kval * sin(Fval))));
+                    REQUIRE(eps_close(sin(lamval), sin(Fval + hval * (fp_t(1) - cos(Fval)) - kval * sin(Fval))));
 
                     // Third output.
                     REQUIRE(!isnan(outs[i + batch_size * 2u]));
                     Fval = outs[i + batch_size * 2u];
-                    hval = .5;
-                    kval = .3;
+                    hval = fp_t(.5);
+                    kval = fp_t(.3);
                     lamval = ins[i + 2u * batch_size];
-                    REQUIRE(eps_close(cos(lamval), cos(Fval + hval * (1. - cos(Fval)) - kval * sin(Fval))));
-                    REQUIRE(eps_close(sin(lamval), sin(Fval + hval * (1. - cos(Fval)) - kval * sin(Fval))));
+                    REQUIRE(eps_close(cos(lamval), cos(Fval + hval * (fp_t(1) - cos(Fval)) - kval * sin(Fval))));
+                    REQUIRE(eps_close(sin(lamval), sin(Fval + hval * (fp_t(1) - cos(Fval)) - kval * sin(Fval))));
                 }
             }
         }
diff --git a/test/kepE.cpp b/test/kepE.cpp
index 0156a221c..e041668fc 100644
--- a/test/kepE.cpp
+++ b/test/kepE.cpp
@@ -67,7 +67,7 @@ using namespace mppp::literals;
 
 #endif
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -136,7 +136,11 @@ TEST_CASE("kepE diff")
 
 TEST_CASE("kepE overloads")
 {
-    auto k = kepE("x"_var, 1.1);
+    auto k = kepE("x"_var, 1.1f);
+    REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1f});
+
+    k = kepE("x"_var, 1.1);
     REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
     REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1});
 
@@ -150,12 +154,16 @@ TEST_CASE("kepE overloads")
     REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{mppp::real128{"1.1"}});
 #endif
 
-#if defined(HEYOKA_HAVE_REAL128)
+#if defined(HEYOKA_HAVE_REAL)
     k = kepE("x"_var, 1.1_r256);
     REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
     REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1_r256});
 #endif
 
+    k = kepE(1.1f, "x"_var);
+    REQUIRE(std::get<func>(k.value()).args()[1] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[0].value()) == number{1.1f});
+
     k = kepE(1.1, "x"_var);
     REQUIRE(std::get<func>(k.value()).args()[1] == "x"_var);
     REQUIRE(std::get<number>(std::get<func>(k.value()).args()[0].value()) == number{1.1});
diff --git a/test/kepF.cpp b/test/kepF.cpp
index 668faba42..f6a1fbe6a 100644
--- a/test/kepF.cpp
+++ b/test/kepF.cpp
@@ -49,7 +49,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -230,6 +230,7 @@ TEST_CASE("kepF diff")
 
 TEST_CASE("kepF overloads")
 {
+    HEYOKA_TEST_KEPF_OVERLOAD(float);
     HEYOKA_TEST_KEPF_OVERLOAD(double);
     HEYOKA_TEST_KEPF_OVERLOAD(long double);
 
@@ -348,7 +349,7 @@ TEST_CASE("cfunc")
 
                     ins[i] = hval;
                     ins[i + batch_size] = kval;
-                    ins[i + 2u * batch_size] = lamval;
+                    ins[i + 2u * batch_size] = static_cast<fp_t>(lamval);
 
                     // Generate another pair of hs and ks for the pars.
                     std::tie(hval, kval) = generate_hk();
@@ -383,8 +384,8 @@ TEST_CASE("cfunc")
                     // Third output.
                     REQUIRE(!isnan(outs[i + batch_size * 2u]));
                     Fval = outs[i + batch_size * 2u];
-                    hval = .5;
-                    kval = .3;
+                    hval = fp_t(.5);
+                    kval = fp_t(.3);
                     lamval = ins[i + 2u * batch_size];
                     REQUIRE(eps_close(cos(lamval), cos(Fval + hval * cos(Fval) - kval * sin(Fval))));
                     REQUIRE(eps_close(sin(lamval), sin(Fval + hval * cos(Fval) - kval * sin(Fval))));
diff --git a/test/llvm_helpers.cpp b/test/llvm_helpers.cpp
index f46b6f0c4..69bab4988 100644
--- a/test/llvm_helpers.cpp
+++ b/test/llvm_helpers.cpp
@@ -54,7 +54,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -227,7 +227,7 @@ TEST_CASE("sgn batch")
 
                 std::uniform_real_distribution<double> rdist(-10., 10.);
                 std::vector<fp_t> values(batch_size);
-                std::generate(values.begin(), values.end(), [&rdist]() { return rdist(rng); });
+                std::generate(values.begin(), values.end(), [&rdist]() { return static_cast<fp_t>(rdist(rng)); });
                 std::vector<std::int32_t> signs(batch_size);
 
                 f_ptr(signs.data(), values.data());
@@ -360,7 +360,7 @@ TEST_CASE("sincos batch")
                 // Setup the argument and the output values.
                 std::vector<fp_t> x_vec(batch_size), s_vec(x_vec), c_vec(x_vec);
                 for (auto i = 0u; i < batch_size; ++i) {
-                    x_vec[i] = i + 1u;
+                    x_vec[i] = static_cast<fp_t>(i + 1u);
                 }
 
                 f_ptr(x_vec.data(), s_vec.data(), c_vec.data());
@@ -437,7 +437,6 @@ TEST_CASE("sincos mp")
 TEST_CASE("inv_kep_E_scalar")
 {
     using detail::llvm_add_inv_kep_E_wrapper;
-    namespace bmt = boost::math::tools;
     using std::cos;
     using std::isnan;
     using std::sin;
@@ -461,7 +460,7 @@ TEST_CASE("inv_kep_E_scalar")
 
             // First set of tests with zero eccentricity.
             for (auto i = 0; i < ntrials; ++i) {
-                const fp_t M = M_dist(rng);
+                const fp_t M = static_cast<fp_t>(M_dist(rng));
                 const fp_t e = 0;
                 fp_t E;
 
@@ -472,8 +471,8 @@ TEST_CASE("inv_kep_E_scalar")
 
             // Non-zero eccentricities.
             for (auto i = 0; i < ntrials * 10; ++i) {
-                const fp_t M = M_dist(rng);
-                const fp_t e = e_dist(rng);
+                const fp_t M = static_cast<fp_t>(M_dist(rng));
+                const fp_t e = static_cast<fp_t>(e_dist(rng));
                 fp_t E;
 
                 f_ptr(&E, &e, &M);
@@ -494,8 +493,8 @@ TEST_CASE("inv_kep_E_scalar")
 
             // Test invalid inputs.
             {
-                fp_t M = 1.23;
-                fp_t e = -.1;
+                fp_t M = static_cast<fp_t>(1.23);
+                fp_t e = static_cast<fp_t>(-.1);
                 fp_t E;
 
                 f_ptr(&E, &e, &M);
@@ -504,7 +503,7 @@ TEST_CASE("inv_kep_E_scalar")
             }
 
             {
-                fp_t M = 1.23;
+                fp_t M = static_cast<fp_t>(1.23);
                 fp_t e = 1.;
                 fp_t E;
 
@@ -514,7 +513,7 @@ TEST_CASE("inv_kep_E_scalar")
             }
 
             {
-                fp_t M = 1.23;
+                fp_t M = static_cast<fp_t>(1.23);
                 fp_t e = std::numeric_limits<fp_t>::infinity();
                 fp_t E;
 
@@ -524,7 +523,7 @@ TEST_CASE("inv_kep_E_scalar")
             }
 
             {
-                fp_t M = 1.23;
+                fp_t M = static_cast<fp_t>(1.23);
                 fp_t e = -std::numeric_limits<fp_t>::infinity();
                 fp_t E;
 
@@ -534,7 +533,7 @@ TEST_CASE("inv_kep_E_scalar")
             }
 
             {
-                fp_t M = 1.23;
+                fp_t M = static_cast<fp_t>(1.23);
                 fp_t e = std::numeric_limits<fp_t>::quiet_NaN();
                 fp_t E;
 
@@ -545,7 +544,7 @@ TEST_CASE("inv_kep_E_scalar")
 
             {
                 fp_t M = std::numeric_limits<fp_t>::infinity();
-                fp_t e = .1;
+                fp_t e = static_cast<fp_t>(.1);
                 fp_t E;
 
                 f_ptr(&E, &e, &M);
@@ -555,7 +554,7 @@ TEST_CASE("inv_kep_E_scalar")
 
             {
                 fp_t M = -std::numeric_limits<fp_t>::infinity();
-                fp_t e = .2;
+                fp_t e = static_cast<fp_t>(.2);
                 fp_t E;
 
                 f_ptr(&E, &e, &M);
@@ -565,7 +564,7 @@ TEST_CASE("inv_kep_E_scalar")
 
             {
                 fp_t M = std::numeric_limits<fp_t>::quiet_NaN();
-                fp_t e = .1;
+                fp_t e = static_cast<fp_t>(.1);
                 fp_t E;
 
                 f_ptr(&E, &e, &M);
@@ -581,7 +580,6 @@ TEST_CASE("inv_kep_E_scalar")
 TEST_CASE("inv_kep_E_batch")
 {
     using detail::llvm_add_inv_kep_E_wrapper;
-    namespace bmt = boost::math::tools;
     using std::cos;
     using std::isnan;
     using std::sin;
@@ -610,7 +608,7 @@ TEST_CASE("inv_kep_E_batch")
                 // First set of tests with zero eccentricity.
                 for (auto i = 0; i < ntrials; ++i) {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        M_vec[j] = M_dist(rng);
+                        M_vec[j] = static_cast<fp_t>(M_dist(rng));
                     }
                     f_ptr(ret_vec.data(), e_vec.data(), M_vec.data());
 
@@ -622,8 +620,8 @@ TEST_CASE("inv_kep_E_batch")
                 // Non-zero eccentricities.
                 for (auto i = 0; i < ntrials * 10; ++i) {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        M_vec[j] = M_dist(rng);
-                        e_vec[j] = e_dist(rng);
+                        M_vec[j] = static_cast<fp_t>(M_dist(rng));
+                        e_vec[j] = static_cast<fp_t>(e_dist(rng));
                     }
                     f_ptr(ret_vec.data(), e_vec.data(), M_vec.data());
 
@@ -635,12 +633,12 @@ TEST_CASE("inv_kep_E_batch")
                 // Test invalid inputs.
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        M_vec[j] = M_dist(rng);
+                        M_vec[j] = static_cast<fp_t>(M_dist(rng));
 
                         if (j == 1u) {
-                            e_vec[j] = -.1;
+                            e_vec[j] = static_cast<fp_t>(-.1);
                         } else {
-                            e_vec[j] = e_dist(rng);
+                            e_vec[j] = static_cast<fp_t>(e_dist(rng));
                         }
                     }
 
@@ -657,12 +655,12 @@ TEST_CASE("inv_kep_E_batch")
 
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        M_vec[j] = M_dist(rng);
+                        M_vec[j] = static_cast<fp_t>(M_dist(rng));
 
                         if (j == 1u) {
                             e_vec[j] = 1;
                         } else {
-                            e_vec[j] = e_dist(rng);
+                            e_vec[j] = static_cast<fp_t>(e_dist(rng));
                         }
                     }
 
@@ -679,12 +677,12 @@ TEST_CASE("inv_kep_E_batch")
 
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        M_vec[j] = M_dist(rng);
+                        M_vec[j] = static_cast<fp_t>(M_dist(rng));
 
                         if (j == 1u) {
                             e_vec[j] = std::numeric_limits<fp_t>::infinity();
                         } else {
-                            e_vec[j] = e_dist(rng);
+                            e_vec[j] = static_cast<fp_t>(e_dist(rng));
                         }
                     }
 
@@ -701,12 +699,12 @@ TEST_CASE("inv_kep_E_batch")
 
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        M_vec[j] = M_dist(rng);
+                        M_vec[j] = static_cast<fp_t>(M_dist(rng));
 
                         if (j == 1u) {
                             e_vec[j] = -std::numeric_limits<fp_t>::infinity();
                         } else {
-                            e_vec[j] = e_dist(rng);
+                            e_vec[j] = static_cast<fp_t>(e_dist(rng));
                         }
                     }
 
@@ -723,12 +721,12 @@ TEST_CASE("inv_kep_E_batch")
 
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        M_vec[j] = M_dist(rng);
+                        M_vec[j] = static_cast<fp_t>(M_dist(rng));
 
                         if (j == 1u) {
                             e_vec[j] = std::numeric_limits<fp_t>::quiet_NaN();
                         } else {
-                            e_vec[j] = e_dist(rng);
+                            e_vec[j] = static_cast<fp_t>(e_dist(rng));
                         }
                     }
 
@@ -745,12 +743,12 @@ TEST_CASE("inv_kep_E_batch")
 
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        e_vec[j] = e_dist(rng);
+                        e_vec[j] = static_cast<fp_t>(e_dist(rng));
 
                         if (j == 1u) {
                             M_vec[j] = std::numeric_limits<fp_t>::infinity();
                         } else {
-                            M_vec[j] = M_dist(rng);
+                            M_vec[j] = static_cast<fp_t>(M_dist(rng));
                         }
                     }
 
@@ -767,12 +765,12 @@ TEST_CASE("inv_kep_E_batch")
 
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        e_vec[j] = e_dist(rng);
+                        e_vec[j] = static_cast<fp_t>(e_dist(rng));
 
                         if (j == 1u) {
                             M_vec[j] = -std::numeric_limits<fp_t>::infinity();
                         } else {
-                            M_vec[j] = M_dist(rng);
+                            M_vec[j] = static_cast<fp_t>(M_dist(rng));
                         }
                     }
 
@@ -789,12 +787,12 @@ TEST_CASE("inv_kep_E_batch")
 
                 {
                     for (auto j = 0u; j < batch_size; ++j) {
-                        e_vec[j] = e_dist(rng);
+                        e_vec[j] = static_cast<fp_t>(e_dist(rng));
 
                         if (j == 1u) {
                             M_vec[j] = std::numeric_limits<fp_t>::quiet_NaN();
                         } else {
-                            M_vec[j] = M_dist(rng);
+                            M_vec[j] = static_cast<fp_t>(M_dist(rng));
                         }
                     }
 
@@ -1408,13 +1406,13 @@ TEST_CASE("minmax")
                         if (idist(rng) && idist(rng) && idist(rng)) {
                             av[j] = std::numeric_limits<fp_t>::quiet_NaN();
                         } else {
-                            av[j] = rdist(rng);
+                            av[j] = static_cast<fp_t>(rdist(rng));
                         }
 
                         if (idist(rng) && idist(rng) && idist(rng)) {
                             bv[j] = std::numeric_limits<fp_t>::quiet_NaN();
                         } else {
-                            bv[j] = rdist(rng);
+                            bv[j] = static_cast<fp_t>(rdist(rng));
                         }
                     }
 
@@ -1569,9 +1567,9 @@ TEST_CASE("fma batch")
                 // Setup the arguments and the output value.
                 std::vector<fp_t> ret_vec(batch_size), a_vec(ret_vec), b_vec(ret_vec), c_vec(ret_vec);
                 for (auto i = 0u; i < batch_size; ++i) {
-                    a_vec[i] = i + 1u;
-                    b_vec[i] = a_vec[i] * 10 * (i + 1u);
-                    c_vec[i] = b_vec[i] * 10 * (i + 1u);
+                    a_vec[i] = static_cast<fp_t>(i + 1u);
+                    b_vec[i] = a_vec[i] * 10 * static_cast<fp_t>(i + 1u);
+                    c_vec[i] = b_vec[i] * 10 * static_cast<fp_t>(i + 1u);
                 }
 
                 f_ptr(ret_vec.data(), a_vec.data(), b_vec.data(), c_vec.data());
@@ -2610,7 +2608,7 @@ TEST_CASE("dl modulus scalar")
                 using mp_fp_t
                     = bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
 
-                std::uniform_real_distribution<fp_t> op_dist(-1e6, 1e6), quo_dist(.1, 10.);
+                std::uniform_real_distribution<fp_t> op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.));
 
                 for (auto i = 0; i < ntrials; ++i) {
                     auto x = fp_t(op_dist(rng)), y = fp_t(quo_dist(rng));
@@ -2695,7 +2693,7 @@ TEST_CASE("dl modulus batch")
                     using mp_fp_t
                         = bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
 
-                    std::uniform_real_distribution<fp_t> op_dist(-1e6, 1e6), quo_dist(.1, 10.);
+                    std::uniform_real_distribution<fp_t> op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.));
 
                     std::vector<fp_t> x_vec(batch_size), y_vec(x_vec), a_hi_vec(x_vec), a_lo_vec(x_vec),
                         b_hi_vec(x_vec), b_lo_vec(x_vec);
@@ -2739,7 +2737,10 @@ TEST_CASE("get_alignment")
     auto &context = s.context();
     auto &builder = s.builder();
 
-    auto *tp = detail::to_llvm_type<double>(context);
+    auto *tp = detail::to_llvm_type<float>(context);
+    REQUIRE(detail::get_alignment(md, tp) == alignof(float));
+
+    tp = detail::to_llvm_type<double>(context);
     REQUIRE(detail::get_alignment(md, tp) == alignof(double));
 
 #if !defined(HEYOKA_ARCH_PPC)
diff --git a/test/llvm_state.cpp b/test/llvm_state.cpp
index e3ebbbdbe..1292af5db 100644
--- a/test/llvm_state.cpp
+++ b/test/llvm_state.cpp
@@ -49,6 +49,7 @@ using namespace heyoka_test;
 
 TEST_CASE("simd size")
 {
+    REQUIRE(recommended_simd_size<float>() > 0u);
     REQUIRE(recommended_simd_size<double>() > 0u);
     REQUIRE(recommended_simd_size<long double>() > 0u);
 
@@ -63,6 +64,7 @@ TEST_CASE("simd size")
 #if defined(__GNUC__)
 
 #if defined(__amd64__) || defined(__aarch64__)
+    REQUIRE(recommended_simd_size<float>() >= 4u);
     REQUIRE(recommended_simd_size<double>() >= 2u);
 #endif
 
diff --git a/test/log.cpp b/test/log.cpp
index 136110ac8..4d5a88626 100644
--- a/test/log.cpp
+++ b/test/log.cpp
@@ -49,7 +49,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -210,8 +210,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(log(x), {{x, .1_dbl}})) == log(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -274,3 +274,69 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {log(a), log(b), log(c), log(d)});
+
+    s.compile();
+
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1., 2., 3., 4.};
+    std::vector<float> outs(4u, 0.);
+
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+
+    REQUIRE(outs[0] == approximately(std::log(1.f)));
+    REQUIRE(outs[1] == approximately(std::log(2.f)));
+    REQUIRE(outs[2] == approximately(std::log(3.f)));
+    REQUIRE(outs[3] == approximately(std::log(4.f)));
+
+#if defined(HEYOKA_WITH_SLEEF)
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f32", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+#if LLVM_VERSION_MAJOR >= 16
+
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+#endif
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/neg.cpp b/test/neg.cpp
index 558d6dce1..d724d413f 100644
--- a/test/neg.cpp
+++ b/test/neg.cpp
@@ -46,7 +46,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/pow.cpp b/test/pow.cpp
index 8fa48ab62..27fbfbd87 100644
--- a/test/pow.cpp
+++ b/test/pow.cpp
@@ -66,7 +66,7 @@ using namespace mppp::literals;
 
 #endif
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -418,8 +418,35 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(pow(x, y), {{x, x * y}, {y, -2_dbl}})) == pow(x, -2.) * pow(y, -2.));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+TEST_CASE("pow overloads")
+{
+    auto k = pow("x"_var, 1.1f);
+    REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1f});
+
+    k = pow("x"_var, 1.1);
+    REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1});
+
+    k = pow("x"_var, 1.1l);
+    REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1l});
+
+#if defined(HEYOKA_HAVE_REAL128)
+    k = pow("x"_var, mppp::real128{"1.1"});
+    REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{mppp::real128{"1.1"}});
+#endif
+
+#if defined(HEYOKA_HAVE_REAL)
+    k = pow("x"_var, 1.1_r256);
+    REQUIRE(std::get<func>(k.value()).args()[0] == "x"_var);
+    REQUIRE(std::get<number>(std::get<func>(k.value()).args()[1].value()) == number{1.1_r256});
+#endif
+}
+
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -482,3 +509,68 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {pow(a, .6f), pow(b, .7f), pow(c, .8f), pow(d, .9f)});
+
+    s.compile();
+
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{.1f, .2f, .3f, .4f};
+    std::vector<float> outs(4u, 0.);
+
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+
+    REQUIRE(outs[0] == approximately(std::pow(.1f, .6f)));
+    REQUIRE(outs[1] == approximately(std::pow(.2f, .7f)));
+    REQUIRE(outs[2] == approximately(std::pow(.3f, .8f)));
+    REQUIRE(outs[3] == approximately(std::pow(.4f, .9f)));
+
+#if defined(HEYOKA_WITH_SLEEF)
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f32", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+#if LLVM_VERSION_MAJOR >= 16
+
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+#endif
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/prod.cpp b/test/prod.cpp
index 43ae82b97..b6b93b615 100644
--- a/test/prod.cpp
+++ b/test/prod.cpp
@@ -54,7 +54,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/relu.cpp b/test/relu.cpp
index 51e72c134..810cb705a 100644
--- a/test/relu.cpp
+++ b/test/relu.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -412,11 +412,11 @@ TEST_CASE("cfunc")
             for (auto niter = 0; niter < 100; ++niter) {
                 for (auto i = 0u; i < batch_size; ++i) {
                     // Generate the xs.
-                    ins[i] = x_dist(rng);
+                    ins[i] = static_cast<fp_t>(x_dist(rng));
 
                     // Generate the pars.
-                    pars[i] = x_dist(rng);
-                    pars[i + batch_size] = x_dist(rng);
+                    pars[i] = static_cast<fp_t>(x_dist(rng));
+                    pars[i + batch_size] = static_cast<fp_t>(x_dist(rng));
                 }
 
                 cf_ptr(outs.data(), ins.data(), pars.data(), nullptr);
@@ -480,11 +480,11 @@ TEST_CASE("cfunc leaky")
             for (auto niter = 0; niter < 100; ++niter) {
                 for (auto i = 0u; i < batch_size; ++i) {
                     // Generate the xs.
-                    ins[i] = x_dist(rng);
+                    ins[i] = static_cast<fp_t>(x_dist(rng));
 
                     // Generate the pars.
-                    pars[i] = x_dist(rng);
-                    pars[i + batch_size] = x_dist(rng);
+                    pars[i] = static_cast<fp_t>(x_dist(rng));
+                    pars[i + batch_size] = static_cast<fp_t>(x_dist(rng));
                 }
 
                 cf_ptr(outs.data(), ins.data(), pars.data(), nullptr);
diff --git a/test/sigmoid.cpp b/test/sigmoid.cpp
index c65d31a2e..b4e05d865 100644
--- a/test/sigmoid.cpp
+++ b/test/sigmoid.cpp
@@ -47,7 +47,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/sin.cpp b/test/sin.cpp
index b9b606414..8948d49db 100644
--- a/test/sin.cpp
+++ b/test/sin.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -112,6 +112,7 @@ TEST_CASE("sin number simpl")
     auto [x] = make_vars("x");
 
     REQUIRE(sin(x * 0.) == 0_dbl);
+    REQUIRE(sin(0.123_flt) == expression{sin(0.123f)});
     REQUIRE(sin(0.123_dbl) == expression{sin(0.123)});
     REQUIRE(sin(-0.123_ldbl) == expression{sin(-0.123l)});
 
@@ -228,8 +229,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(sin(x), {{x, .1_dbl}})) == sin(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -391,3 +392,178 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {sin(a), sin(b), sin(c), sin(d)});
+
+    s.compile();
+
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1., 2., 3., 4.};
+    std::vector<float> outs(4u, 0.);
+
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+
+    REQUIRE(outs[0] == approximately(std::sin(1.f)));
+    REQUIRE(outs[1] == approximately(std::sin(2.f)));
+    REQUIRE(outs[2] == approximately(std::sin(3.f)));
+    REQUIRE(outs[3] == approximately(std::sin(4.f)));
+
+#if defined(HEYOKA_WITH_SLEEF)
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+#if LLVM_VERSION_MAJOR >= 16
+
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+#endif
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+    // Some more extensive testing specific to x86, only for this function.
+    auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i");
+
+    llvm_state s2{kw::slp_vectorize = true};
+
+    add_cfunc<float>(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h)});
+    add_cfunc<float>(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h), sin(i)});
+
+    s2.compile();
+
+    auto *cf1_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s2.jit_lookup("cfunc1"));
+    auto *cf2_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s2.jit_lookup("cfunc2"));
+
+    const std::vector<float> ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> outs2(9u, 0.);
+
+    cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr);
+
+    REQUIRE(outs2[0] == approximately(std::sin(1.f)));
+    REQUIRE(outs2[1] == approximately(std::sin(2.f)));
+    REQUIRE(outs2[2] == approximately(std::sin(3.f)));
+    REQUIRE(outs2[3] == approximately(std::sin(4.f)));
+    REQUIRE(outs2[4] == approximately(std::sin(5.f)));
+    REQUIRE(outs2[5] == approximately(std::sin(6.f)));
+    REQUIRE(outs2[6] == approximately(std::sin(7.f)));
+    REQUIRE(outs2[7] == approximately(std::sin(8.f)));
+
+    cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr);
+
+    REQUIRE(outs2[0] == approximately(std::sin(1.f)));
+    REQUIRE(outs2[1] == approximately(std::sin(2.f)));
+    REQUIRE(outs2[2] == approximately(std::sin(3.f)));
+    REQUIRE(outs2[3] == approximately(std::sin(4.f)));
+    REQUIRE(outs2[4] == approximately(std::sin(5.f)));
+    REQUIRE(outs2[5] == approximately(std::sin(6.f)));
+    REQUIRE(outs2[6] == approximately(std::sin(7.f)));
+    REQUIRE(outs2[7] == approximately(std::sin(8.f)));
+    REQUIRE(outs2[8] == approximately(std::sin(9.f)));
+
+    ir = s2.get_ir();
+
+    count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    if (tf.avx) {
+        // NOTE: occurrences of the scalar version:
+        // - 8 + 9 calls in the strided cfuncs,
+        // - 1 declaration,
+        // - 1 call to deal with the remainder in the
+        //   9-argument version.
+        REQUIRE(count == 19u);
+    }
+
+    // NOTE: this next test seems to work properly starting
+    // from LLVM 13.
+#if LLVM_VERSION_MAJOR >= 13
+
+    // Check that the autovec works also on batch sizes which do not correspond
+    // exactly to an available vector width.
+    llvm_state s3{kw::slp_vectorize = true};
+
+    add_cfunc<float>(s3, "cfunc", {sin(a)}, kw::batch_size = 5u);
+
+    s3.compile();
+
+    auto *cf3_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s3.jit_lookup("cfunc"));
+
+    std::vector<float> ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.};
+
+    cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr);
+
+    REQUIRE(outs3[0] == approximately(std::sin(1.f)));
+    REQUIRE(outs3[1] == approximately(std::sin(2.f)));
+    REQUIRE(outs3[2] == approximately(std::sin(3.f)));
+    REQUIRE(outs3[3] == approximately(std::sin(4.f)));
+    REQUIRE(outs3[4] == approximately(std::sin(5.f)));
+
+    ir = s3.get_ir();
+
+    count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 1 call in the remainder of the unstrided cfunc,
+        // - 1 call in the remainder of the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 3u);
+    }
+
+#if LLVM_VERSION_MAJOR >= 16
+
+    if (tf.aarch64) {
+        REQUIRE(count == 3u);
+    }
+
+#endif
+
+#endif
+
+#endif
+}
diff --git a/test/sinh.cpp b/test/sinh.cpp
index adaee85d4..a0a218c82 100644
--- a/test/sinh.cpp
+++ b/test/sinh.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -220,8 +220,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(sinh(x), {{x, 1.5_dbl}})) == sinh(1.5_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -370,3 +370,165 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {sinh(a), sinh(b), sinh(c), sinh(d)});
+
+    s.compile();
+
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1., 2., 3., 4.};
+    std::vector<float> outs(4u, 0.);
+
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+
+    REQUIRE(outs[0] == approximately(std::sinh(1.f)));
+    REQUIRE(outs[1] == approximately(std::sinh(2.f)));
+    REQUIRE(outs[2] == approximately(std::sinh(3.f)));
+    REQUIRE(outs[3] == approximately(std::sinh(4.f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+    // Some more extensive testing specific to x86, only for this function.
+    auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i");
+
+    llvm_state s2{kw::slp_vectorize = true};
+
+    add_cfunc<float>(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h)});
+    add_cfunc<float>(s2, "cfunc2", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h), sinh(i)});
+
+    s2.compile();
+
+    auto *cf1_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s2.jit_lookup("cfunc1"));
+    auto *cf2_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s2.jit_lookup("cfunc2"));
+
+    const std::vector<float> ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> outs2(9u, 0.);
+
+    cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr);
+
+    REQUIRE(outs2[0] == approximately(std::sinh(1.f)));
+    REQUIRE(outs2[1] == approximately(std::sinh(2.f)));
+    REQUIRE(outs2[2] == approximately(std::sinh(3.f)));
+    REQUIRE(outs2[3] == approximately(std::sinh(4.f)));
+    REQUIRE(outs2[4] == approximately(std::sinh(5.f)));
+    REQUIRE(outs2[5] == approximately(std::sinh(6.f)));
+    REQUIRE(outs2[6] == approximately(std::sinh(7.f)));
+    REQUIRE(outs2[7] == approximately(std::sinh(8.f)));
+
+    cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr);
+
+    REQUIRE(outs2[0] == approximately(std::sinh(1.f)));
+    REQUIRE(outs2[1] == approximately(std::sinh(2.f)));
+    REQUIRE(outs2[2] == approximately(std::sinh(3.f)));
+    REQUIRE(outs2[3] == approximately(std::sinh(4.f)));
+    REQUIRE(outs2[4] == approximately(std::sinh(5.f)));
+    REQUIRE(outs2[5] == approximately(std::sinh(6.f)));
+    REQUIRE(outs2[6] == approximately(std::sinh(7.f)));
+    REQUIRE(outs2[7] == approximately(std::sinh(8.f)));
+    REQUIRE(outs2[8] == approximately(std::sinh(9.f)));
+
+    ir = s2.get_ir();
+
+    count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    if (tf.avx) {
+        // NOTE: occurrences of the scalar version:
+        // - 8 + 9 calls in the strided cfuncs,
+        // - 1 declaration,
+        // - 1 call to deal with the remainder in the
+        //   9-argument version.
+        REQUIRE(count == 19u);
+    }
+
+    // Check that the autovec works also on batch sizes which do not correspond
+    // exactly to an available vector width.
+    llvm_state s3{kw::slp_vectorize = true};
+
+    add_cfunc<float>(s3, "cfunc", {sinh(a)}, kw::batch_size = 5u);
+
+    s3.compile();
+
+    auto *cf3_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s3.jit_lookup("cfunc"));
+
+    std::vector<float> ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.};
+
+    cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr);
+
+    REQUIRE(outs3[0] == approximately(std::sinh(1.f)));
+    REQUIRE(outs3[1] == approximately(std::sinh(2.f)));
+    REQUIRE(outs3[2] == approximately(std::sinh(3.f)));
+    REQUIRE(outs3[3] == approximately(std::sinh(4.f)));
+    REQUIRE(outs3[4] == approximately(std::sinh(5.f)));
+
+    ir = s3.get_ir();
+
+    count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 1 call in the remainder of the unstrided cfunc,
+        // - 1 call in the remainder of the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 3u);
+    }
+
+    if (tf.aarch64) {
+        REQUIRE(count == 3u);
+    }
+
+#endif
+}
diff --git a/test/sqrt.cpp b/test/sqrt.cpp
index b4e28b000..daa6a55f3 100644
--- a/test/sqrt.cpp
+++ b/test/sqrt.cpp
@@ -50,7 +50,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -206,8 +206,8 @@ TEST_CASE("cfunc_mp")
 
 #endif
 
-// Test to check vectorisation.
-TEST_CASE("slp vect")
+// Tests to check vectorisation.
+TEST_CASE("slp vect double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -270,3 +270,69 @@ TEST_CASE("slp vect")
 
 #endif
 }
+
+TEST_CASE("slp vect float")
+{
+    llvm_state s{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+
+    add_cfunc<float>(s, "cfunc", {sqrt(a), sqrt(b), sqrt(c), sqrt(d)});
+
+    s.compile();
+
+    auto *cf_ptr
+        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));
+
+    const std::vector<float> ins{1., 2., 3., 4.};
+    std::vector<float> outs(4u, 0.);
+
+    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);
+
+    REQUIRE(outs[0] == approximately(std::sqrt(1.f)));
+    REQUIRE(outs[1] == approximately(std::sqrt(2.f)));
+    REQUIRE(outs[2] == approximately(std::sqrt(3.f)));
+    REQUIRE(outs[3] == approximately(std::sqrt(4.f)));
+
+#if defined(HEYOKA_WITH_SLEEF)
+
+    const auto &tf = detail::get_target_features();
+
+    auto ir = s.get_ir();
+
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto count = 0u;
+    for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sqrt.f32", boost::is_iequal()));
+         it != string_find_iterator(); ++it) {
+        ++count;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    if (tf.sse2) {
+        // NOTE: occurrences of the scalar version:
+        // - 4 calls in the strided cfunc,
+        // - 1 declaration.
+        REQUIRE(count == 5u);
+    }
+
+#if LLVM_VERSION_MAJOR >= 16
+
+    // NOTE: LLVM16 is currently the version tested in the CI on arm64.
+    if (tf.aarch64) {
+        REQUIRE(count == 5u);
+    }
+
+#endif
+
+    // NOTE: currently no auto-vectorization happens on ppc64, apparently due
+    // to the way the target machine is being set up by orc/lljit (it works
+    // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+    // if (tf.vsx) {
+    //     REQUIRE(count == 5u);
+    // }
+
+#endif
+}
diff --git a/test/square.cpp b/test/square.cpp
index f9fcd8db2..cb71f3e26 100644
--- a/test/square.cpp
+++ b/test/square.cpp
@@ -55,7 +55,7 @@ auto square_wrapper(const heyoka::expression &x)
     return pow(x, 2.);
 }
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -154,7 +154,7 @@ TEST_CASE("cfunc")
 
             for (auto i = 0u; i < batch_size; ++i) {
                 REQUIRE(outs[i] == approximately(ins[i] * ins[i], fp_t(100)));
-                REQUIRE(outs[i + batch_size] == approximately(static_cast<fp_t>(.5) * .5, fp_t(100)));
+                REQUIRE(outs[i + batch_size] == approximately(static_cast<fp_t>(.5) * fp_t(.5), fp_t(100)));
                 REQUIRE(outs[i + 2u * batch_size] == approximately(pars[i] * pars[i], fp_t(100)));
             }
         }
diff --git a/test/sub.cpp b/test/sub.cpp
index 3affb2a0b..6897268cd 100644
--- a/test/sub.cpp
+++ b/test/sub.cpp
@@ -46,7 +46,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/sum.cpp b/test/sum.cpp
index 4df57a27d..f3544ce5a 100644
--- a/test/sum.cpp
+++ b/test/sum.cpp
@@ -56,7 +56,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/sum_sq.cpp b/test/sum_sq.cpp
index 617ee3b41..65e137187 100644
--- a/test/sum_sq.cpp
+++ b/test/sum_sq.cpp
@@ -67,7 +67,7 @@ auto sum_sq(const std::vector<expression> &args)
     return sum(new_args);
 }
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -238,7 +238,7 @@ TEST_CASE("cfunc")
                 REQUIRE(outs[i]
                         == approximately(ins[i] * ins[i] + ins[i + batch_size] * ins[i + batch_size], fp_t(100)));
                 REQUIRE(outs[i + batch_size]
-                        == approximately(ins[i] * ins[i] + static_cast<fp_t>(.5) * 0.5, fp_t(100)));
+                        == approximately(ins[i] * ins[i] + static_cast<fp_t>(.5) * fp_t(0.5), fp_t(100)));
                 REQUIRE(outs[i + 2u * batch_size]
                         == approximately(pars[i] * pars[i] + ins[i + batch_size] * ins[i + batch_size], fp_t(100)));
             }
diff --git a/test/tan.cpp b/test/tan.cpp
index 64bfc6750..66788af21 100644
--- a/test/tan.cpp
+++ b/test/tan.cpp
@@ -49,7 +49,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -219,8 +219,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(tan(x), {{x, .1_dbl}})) == tan(.1_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -280,3 +280,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    // Single-precision version of the vector-function-abi-variant check:
+    // JIT-compile a cfunc computing tan() of four variables with SLP enabled.
+    llvm_state state{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+    add_cfunc<float>(state, "cfunc", {tan(a), tan(b), tan(c), tan(d)});
+    state.compile();
+
+    // Look up the compiled function; the last two pointer arguments stay null.
+    using jit_fn_t = void (*)(float *, const float *, const float *, const float *);
+    auto *jit_fn = reinterpret_cast<jit_fn_t>(state.jit_lookup("cfunc"));
+
+    const std::vector<float> inputs{1., 2., 3., 4.};
+    std::vector<float> outputs(4u, 0.);
+    jit_fn(outputs.data(), inputs.data(), nullptr, nullptr);
+
+    // The JIT-compiled results must approximately match the standard library.
+    REQUIRE(outputs[0] == approximately(std::tan(1.f)));
+    REQUIRE(outputs[1] == approximately(std::tan(2.f)));
+    REQUIRE(outputs[2] == approximately(std::tan(3.f)));
+    REQUIRE(outputs[3] == approximately(std::tan(4.f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+    const auto &features = detail::get_target_features();
+    auto ir_text = state.get_ir();
+
+    // Tally the case-insensitive matches of "@tanf" in the IR.
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto n_matches = 0u;
+    auto match_it = boost::make_find_iterator(ir_text, boost::first_finder("@tanf", boost::is_iequal()));
+    while (match_it != string_find_iterator()) {
+        ++n_matches;
+        ++match_it;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    // NOTE: expected occurrences of the scalar version:
+    // - 4 calls in the strided cfunc,
+    // - 1 declaration.
+    if (features.sse2) {
+        REQUIRE(n_matches == 5u);
+    }
+    if (features.aarch64) {
+        REQUIRE(n_matches == 5u);
+    }
+
+    // NOTE: ppc64 is not checked yet: currently no auto-vectorization happens
+    // there, apparently due to the way the target machine is being set up by
+    // orc/lljit (it works fine with the opt tool). When this is resolved,
+    // the following check can be enabled:
+    // if (features.vsx) {
+    //     REQUIRE(n_matches == 5u);
+    // }
+
+#endif
+}
diff --git a/test/tanh.cpp b/test/tanh.cpp
index 52c9ca51b..43b6f6b2b 100644
--- a/test/tanh.cpp
+++ b/test/tanh.cpp
@@ -49,7 +49,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -220,8 +220,8 @@ TEST_CASE("normalise")
     REQUIRE(normalise(subs(tanh(x), {{x, -.5_dbl}})) == tanh(-.5_dbl));
 }
 
-// Test to check vectorisation via the vector-function-abi-variant machinery.
-TEST_CASE("vfabi")
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
 {
     llvm_state s{kw::slp_vectorize = true};
 
@@ -281,3 +281,66 @@ TEST_CASE("vfabi")
 
 #endif
 }
+
+TEST_CASE("vfabi float")
+{
+    // Single-precision version of the vector-function-abi-variant check:
+    // JIT-compile a cfunc computing tanh() of four variables with SLP enabled.
+    llvm_state state{kw::slp_vectorize = true};
+
+    auto [a, b, c, d] = make_vars("a", "b", "c", "d");
+    add_cfunc<float>(state, "cfunc", {tanh(a), tanh(b), tanh(c), tanh(d)});
+    state.compile();
+
+    // Look up the compiled function; the last two pointer arguments stay null.
+    using jit_fn_t = void (*)(float *, const float *, const float *, const float *);
+    auto *jit_fn = reinterpret_cast<jit_fn_t>(state.jit_lookup("cfunc"));
+
+    const std::vector<float> inputs{1., 2., 3., 4.};
+    std::vector<float> outputs(4u, 0.);
+    jit_fn(outputs.data(), inputs.data(), nullptr, nullptr);
+
+    // The JIT-compiled results must approximately match the standard library.
+    REQUIRE(outputs[0] == approximately(std::tanh(1.f)));
+    REQUIRE(outputs[1] == approximately(std::tanh(2.f)));
+    REQUIRE(outputs[2] == approximately(std::tanh(3.f)));
+    REQUIRE(outputs[3] == approximately(std::tanh(4.f)));
+
+    // NOTE: autovec with external scalar functions seems to work
+    // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+    const auto &features = detail::get_target_features();
+    auto ir_text = state.get_ir();
+
+    // Tally the case-insensitive matches of "@tanhf" in the IR.
+    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+    auto n_matches = 0u;
+    auto match_it = boost::make_find_iterator(ir_text, boost::first_finder("@tanhf", boost::is_iequal()));
+    while (match_it != string_find_iterator()) {
+        ++n_matches;
+        ++match_it;
+    }
+
+    // NOTE: at the moment we have comprehensive coverage of LLVM versions
+    // in the CI only for x86_64.
+    // NOTE: expected occurrences of the scalar version:
+    // - 4 calls in the strided cfunc,
+    // - 1 declaration.
+    if (features.sse2) {
+        REQUIRE(n_matches == 5u);
+    }
+    if (features.aarch64) {
+        REQUIRE(n_matches == 5u);
+    }
+
+    // NOTE: ppc64 is not checked yet: currently no auto-vectorization happens
+    // there, apparently due to the way the target machine is being set up by
+    // orc/lljit (it works fine with the opt tool). When this is resolved,
+    // the following check can be enabled:
+    // if (features.vsx) {
+    //     REQUIRE(n_matches == 5u);
+    // }
+
+#endif
+}
diff --git a/test/taylor_acos.cpp b/test/taylor_acos.cpp
index e1c25fb60..05905e285 100644
--- a/test/taylor_acos.cpp
+++ b/test/taylor_acos.cpp
@@ -40,7 +40,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -237,7 +237,7 @@ TEST_CASE("taylor acos")
             std::vector<fp_t> jet{fp_t{2}, fp_t{-4}, fp_t{3}, fp_t{5}};
             jet.resize(8);
 
-            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t{.3}, fp_t{.3}};
+            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t(.3), fp_t(.3)};
 
             jptr(jet.data(), pars.data(), nullptr);
 
@@ -247,8 +247,8 @@ TEST_CASE("taylor acos")
             REQUIRE(jet[2] == 3);
             REQUIRE(jet[3] == 5);
 
-            REQUIRE(jet[4] == approximately(acos(fp_t{.3})));
-            REQUIRE(jet[5] == approximately(acos(fp_t{.3})));
+            REQUIRE(jet[4] == approximately(acos(fp_t(.3))));
+            REQUIRE(jet[5] == approximately(acos(fp_t(.3))));
 
             REQUIRE(jet[6] == approximately(jet[0] + jet[2]));
             REQUIRE(jet[7] == approximately(jet[1] + jet[3]));
@@ -424,13 +424,13 @@ TEST_CASE("taylor acos")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(acos(jet[1])));
             REQUIRE(jet[3] == approximately(acos(jet[0])));
         }
@@ -444,16 +444,16 @@ TEST_CASE("taylor acos")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(acos(jet[2])));
             REQUIRE(jet[5] == approximately(acos(jet[3])));
@@ -471,13 +471,13 @@ TEST_CASE("taylor acos")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(acos(jet[1])));
             REQUIRE(jet[3] == approximately(acos(jet[0])));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * (-1 / sqrt(1 - jet[1] * jet[1]) * jet[3])));
@@ -493,16 +493,16 @@ TEST_CASE("taylor acos")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(acos(jet[2])));
             REQUIRE(jet[5] == approximately(acos(jet[3])));
@@ -526,18 +526,18 @@ TEST_CASE("taylor acos")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{-.5}, fp_t{.3}, fp_t{-.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t{-.5}, fp_t(.3), fp_t(-.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
-            REQUIRE(jet[2] == -.5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
+            REQUIRE(jet[2] == fp_t(-.5));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == -.4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(-.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(acos(jet[3])));
             REQUIRE(jet[7] == approximately(acos(jet[4])));
diff --git a/test/taylor_acosh.cpp b/test/taylor_acosh.cpp
index 1fc5fcf63..12e3dbb04 100644
--- a/test/taylor_acosh.cpp
+++ b/test/taylor_acosh.cpp
@@ -40,7 +40,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -267,7 +267,7 @@ TEST_CASE("taylor acosh")
             std::vector<fp_t> jet{fp_t{2}, fp_t{-4}, fp_t{3}, fp_t{5}};
             jet.resize(8);
 
-            std::vector<fp_t> pars{fp_t{1.5}, fp_t{1.5}, fp_t{1.3}, fp_t{1.3}};
+            std::vector<fp_t> pars{fp_t{1.5}, fp_t{1.5}, fp_t(1.3), fp_t(1.3)};
 
             jptr(jet.data(), pars.data(), nullptr);
 
@@ -277,8 +277,8 @@ TEST_CASE("taylor acosh")
             REQUIRE(jet[2] == 3);
             REQUIRE(jet[3] == 5);
 
-            REQUIRE(jet[4] == approximately(acosh(fp_t{1.3})));
-            REQUIRE(jet[5] == approximately(acosh(fp_t{1.3})));
+            REQUIRE(jet[4] == approximately(acosh(fp_t(1.3))));
+            REQUIRE(jet[5] == approximately(acosh(fp_t(1.3))));
 
             REQUIRE(jet[6] == approximately(jet[0] + jet[2]));
             REQUIRE(jet[7] == approximately(jet[1] + jet[3]));
@@ -454,13 +454,13 @@ TEST_CASE("taylor acosh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{1.2}, fp_t{1.3}};
+            std::vector<fp_t> jet{fp_t(1.2), fp_t(1.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == 1.2);
-            REQUIRE(jet[1] == 1.3);
+            REQUIRE(jet[0] == fp_t(1.2));
+            REQUIRE(jet[1] == fp_t(1.3));
             REQUIRE(jet[2] == approximately(acosh(jet[1])));
             REQUIRE(jet[3] == approximately(acosh(jet[0])));
         }
@@ -474,16 +474,16 @@ TEST_CASE("taylor acosh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{1.2}, fp_t{1.1}, fp_t{1.3}, fp_t{1.4}};
+            std::vector<fp_t> jet{fp_t(1.2), fp_t(1.1), fp_t(1.3), fp_t(1.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == 1.2);
-            REQUIRE(jet[1] == 1.1);
+            REQUIRE(jet[0] == fp_t(1.2));
+            REQUIRE(jet[1] == fp_t(1.1));
 
-            REQUIRE(jet[2] == 1.3);
-            REQUIRE(jet[3] == 1.4);
+            REQUIRE(jet[2] == fp_t(1.3));
+            REQUIRE(jet[3] == fp_t(1.4));
 
             REQUIRE(jet[4] == approximately(acosh(jet[2])));
             REQUIRE(jet[5] == approximately(acosh(jet[3])));
@@ -501,13 +501,13 @@ TEST_CASE("taylor acosh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{1.2}, fp_t{1.3}};
+            std::vector<fp_t> jet{fp_t(1.2), fp_t(1.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == 1.2);
-            REQUIRE(jet[1] == 1.3);
+            REQUIRE(jet[0] == fp_t(1.2));
+            REQUIRE(jet[1] == fp_t(1.3));
             REQUIRE(jet[2] == approximately(acosh(jet[1])));
             REQUIRE(jet[3] == approximately(acosh(jet[0])));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * (1 / sqrt(-1 + jet[1] * jet[1]) * jet[3])));
@@ -523,16 +523,16 @@ TEST_CASE("taylor acosh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{1.2}, fp_t{1.1}, fp_t{1.3}, fp_t{1.4}};
+            std::vector<fp_t> jet{fp_t(1.2), fp_t(1.1), fp_t(1.3), fp_t(1.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == 1.2);
-            REQUIRE(jet[1] == 1.1);
+            REQUIRE(jet[0] == fp_t(1.2));
+            REQUIRE(jet[1] == fp_t(1.1));
 
-            REQUIRE(jet[2] == 1.3);
-            REQUIRE(jet[3] == 1.4);
+            REQUIRE(jet[2] == fp_t(1.3));
+            REQUIRE(jet[3] == fp_t(1.4));
 
             REQUIRE(jet[4] == approximately(acosh(jet[2])));
             REQUIRE(jet[5] == approximately(acosh(jet[3])));
@@ -556,18 +556,18 @@ TEST_CASE("taylor acosh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{1.2}, fp_t{1.1}, fp_t{1.5}, fp_t{1.3}, fp_t{1.4}, fp_t{1.6}};
+            std::vector<fp_t> jet{fp_t(1.2), fp_t(1.1), fp_t{1.5}, fp_t(1.3), fp_t(1.4), fp_t(1.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == 1.2);
-            REQUIRE(jet[1] == 1.1);
-            REQUIRE(jet[2] == 1.5);
+            REQUIRE(jet[0] == fp_t(1.2));
+            REQUIRE(jet[1] == fp_t(1.1));
+            REQUIRE(jet[2] == fp_t(1.5));
 
-            REQUIRE(jet[3] == 1.3);
-            REQUIRE(jet[4] == 1.4);
-            REQUIRE(jet[5] == 1.6);
+            REQUIRE(jet[3] == fp_t(1.3));
+            REQUIRE(jet[4] == fp_t(1.4));
+            REQUIRE(jet[5] == fp_t(1.6));
 
             REQUIRE(jet[6] == approximately(acosh(jet[3])));
             REQUIRE(jet[7] == approximately(acosh(jet[4])));
diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp
index fcf151f5f..70ff84201 100644
--- a/test/taylor_adaptive.cpp
+++ b/test/taylor_adaptive.cpp
@@ -72,7 +72,7 @@ auto &horner_eval(Out &ret, const P &p, int order, const T &eval)
     return ret;
 }
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -1929,7 +1929,7 @@ TEST_CASE("callback ste")
 
         using ev_t = typename taylor_adaptive<fp_t>::t_event_t;
         auto ta = taylor_adaptive<fp_t>{
-            {prime(x) = v, prime(v) = -9.8 * sin(x)}, {-0.0001, 0.025}, kw::t_events = {ev_t(x)}};
+            {prime(x) = v, prime(v) = -9.8 * sin(x)}, {fp_t(-0.0001), fp_t(0.025)}, kw::t_events = {ev_t(x)}};
 
         int n_invoked = 0;
         auto pcb = [&n_invoked](auto &) {
diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp
index 1af4bf3b4..94233b0a8 100644
--- a/test/taylor_adaptive_batch.cpp
+++ b/test/taylor_adaptive_batch.cpp
@@ -55,7 +55,7 @@ using namespace heyoka;
 namespace hy = heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -1166,9 +1166,9 @@ TEST_CASE("stream output")
 
         {
             auto ta = taylor_adaptive_batch<fp_t>{{prime(x) = v - par[1], prime(v) = -9.8 * sin(x + par[0])},
-                                                  {0., 0.01, 0.5, 0.51},
+                                                  {fp_t(0.), fp_t(0.01), fp_t(0.5), fp_t(0.51)},
                                                   2u,
-                                                  kw::pars = std::vector<fp_t>{-1e-4, -1.1e-4}};
+                                                  kw::pars = std::vector{fp_t(-1e-4), fp_t(-1.1e-4)}};
 
             std::ostringstream oss;
 
@@ -1188,7 +1188,7 @@ TEST_CASE("stream output")
 
         {
             auto tad = taylor_adaptive_batch<fp_t>{{prime(x) = v - par[1], prime(v) = -9.8 * sin(x + par[0])},
-                                                   {0., 0.01, 0.5, 0.51},
+                                                   {fp_t(0.), fp_t(0.01), fp_t(0.5), fp_t(0.51)},
                                                    2u,
                                                    kw::t_events = {t_ev_t(x)}};
 
@@ -1205,7 +1205,7 @@ TEST_CASE("stream output")
         {
             auto tad
                 = taylor_adaptive_batch<fp_t>{{prime(x) = v - par[1], prime(v) = -9.8 * sin(x + par[0])},
-                                              {0., 0.01, 0.5, 0.51},
+                                              {fp_t(0.), fp_t(0.01), fp_t(0.5), fp_t(0.51)},
                                               2u,
                                               kw::nt_events = {nt_ev_t(x, [](auto &, fp_t, int, std::uint32_t) {})}};
 
@@ -1222,7 +1222,7 @@ TEST_CASE("stream output")
         {
             auto tad
                 = taylor_adaptive_batch<fp_t>{{prime(x) = v - par[1], prime(v) = -9.8 * sin(x + par[0])},
-                                              {0., 0.01, 0.5, 0.51},
+                                              {fp_t(0.), fp_t(0.01), fp_t(0.5), fp_t(0.51)},
                                               2u,
                                               kw::t_events = {t_ev_t(x)},
                                               kw::nt_events = {nt_ev_t(x, [](auto &, fp_t, int, std::uint32_t) {})}};
@@ -1816,10 +1816,11 @@ TEST_CASE("callback ste")
         auto [x, v] = make_vars("x", "v");
 
         using ev_t = typename taylor_adaptive_batch<fp_t>::t_event_t;
-        auto ta = taylor_adaptive_batch<fp_t>{{prime(x) = v, prime(v) = -9.8 * sin(x)},
-                                              {-1, -0.0001, -1, -1, 0.025, 0.026, 0.027, 0.028},
-                                              4,
-                                              kw::t_events = {ev_t(x)}};
+        auto ta = taylor_adaptive_batch<fp_t>{
+            {prime(x) = v, prime(v) = -9.8 * sin(x)},
+            {fp_t(-1), fp_t(-0.0001), fp_t(-1), fp_t(-1), fp_t(0.025), fp_t(0.026), fp_t(0.027), fp_t(0.028)},
+            4,
+            kw::t_events = {ev_t(x)}};
 
         int n_invoked = 0;
         auto pcb = [&n_invoked](auto &) {
diff --git a/test/taylor_asin.cpp b/test/taylor_asin.cpp
index 62dd2ab08..f538cca97 100644
--- a/test/taylor_asin.cpp
+++ b/test/taylor_asin.cpp
@@ -40,7 +40,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -235,7 +235,7 @@ TEST_CASE("taylor asin")
             std::vector<fp_t> jet{fp_t{2}, fp_t{-4}, fp_t{3}, fp_t{5}};
             jet.resize(8);
 
-            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t{.3}, fp_t{.3}};
+            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t(.3), fp_t(.3)};
 
             jptr(jet.data(), pars.data(), nullptr);
 
@@ -245,8 +245,8 @@ TEST_CASE("taylor asin")
             REQUIRE(jet[2] == 3);
             REQUIRE(jet[3] == 5);
 
-            REQUIRE(jet[4] == approximately(asin(fp_t{.3})));
-            REQUIRE(jet[5] == approximately(asin(fp_t{.3})));
+            REQUIRE(jet[4] == approximately(asin(fp_t(.3))));
+            REQUIRE(jet[5] == approximately(asin(fp_t(.3))));
 
             REQUIRE(jet[6] == approximately(jet[0] + jet[2]));
             REQUIRE(jet[7] == approximately(jet[1] + jet[3]));
@@ -422,13 +422,13 @@ TEST_CASE("taylor asin")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(asin(jet[1])));
             REQUIRE(jet[3] == approximately(asin(jet[0])));
         }
@@ -442,16 +442,16 @@ TEST_CASE("taylor asin")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(asin(jet[2])));
             REQUIRE(jet[5] == approximately(asin(jet[3])));
@@ -469,13 +469,13 @@ TEST_CASE("taylor asin")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(asin(jet[1])));
             REQUIRE(jet[3] == approximately(asin(jet[0])));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * (1 / sqrt(1 - jet[1] * jet[1]) * jet[3])));
@@ -491,16 +491,16 @@ TEST_CASE("taylor asin")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(asin(jet[2])));
             REQUIRE(jet[5] == approximately(asin(jet[3])));
@@ -524,18 +524,18 @@ TEST_CASE("taylor asin")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{-.5}, fp_t{.3}, fp_t{-.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t{-.5}, fp_t(.3), fp_t(-.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
-            REQUIRE(jet[2] == -.5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
+            REQUIRE(jet[2] == fp_t(-.5));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == -.4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(-.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(asin(jet[3])));
             REQUIRE(jet[7] == approximately(asin(jet[4])));
diff --git a/test/taylor_asinh.cpp b/test/taylor_asinh.cpp
index dbff36720..18b6b2bd0 100644
--- a/test/taylor_asinh.cpp
+++ b/test/taylor_asinh.cpp
@@ -40,7 +40,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -265,7 +265,7 @@ TEST_CASE("taylor asinh")
             std::vector<fp_t> jet{fp_t{2}, fp_t{-4}, fp_t{3}, fp_t{5}};
             jet.resize(8);
 
-            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t{.3}, fp_t{.3}};
+            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t(.3), fp_t(.3)};
 
             jptr(jet.data(), pars.data(), nullptr);
 
@@ -275,8 +275,8 @@ TEST_CASE("taylor asinh")
             REQUIRE(jet[2] == 3);
             REQUIRE(jet[3] == 5);
 
-            REQUIRE(jet[4] == approximately(asinh(fp_t{.3})));
-            REQUIRE(jet[5] == approximately(asinh(fp_t{.3})));
+            REQUIRE(jet[4] == approximately(asinh(fp_t(.3))));
+            REQUIRE(jet[5] == approximately(asinh(fp_t(.3))));
 
             REQUIRE(jet[6] == approximately(jet[0] + jet[2]));
             REQUIRE(jet[7] == approximately(jet[1] + jet[3]));
@@ -452,13 +452,13 @@ TEST_CASE("taylor asinh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(asinh(jet[1])));
             REQUIRE(jet[3] == approximately(asinh(jet[0])));
         }
@@ -472,16 +472,16 @@ TEST_CASE("taylor asinh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(asinh(jet[2])));
             REQUIRE(jet[5] == approximately(asinh(jet[3])));
@@ -499,13 +499,13 @@ TEST_CASE("taylor asinh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(asinh(jet[1])));
             REQUIRE(jet[3] == approximately(asinh(jet[0])));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * (1 / sqrt(1 + jet[1] * jet[1]) * jet[3])));
@@ -521,16 +521,16 @@ TEST_CASE("taylor asinh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(asinh(jet[2])));
             REQUIRE(jet[5] == approximately(asinh(jet[3])));
@@ -554,18 +554,18 @@ TEST_CASE("taylor asinh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{-.5}, fp_t{.3}, fp_t{-.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t{-.5}, fp_t(.3), fp_t(-.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
-            REQUIRE(jet[2] == -.5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
+            REQUIRE(jet[2] == fp_t(-.5));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == -.4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(-.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(asinh(jet[3])));
             REQUIRE(jet[7] == approximately(asinh(jet[4])));
diff --git a/test/taylor_atan.cpp b/test/taylor_atan.cpp
index 013d9d577..39597b03d 100644
--- a/test/taylor_atan.cpp
+++ b/test/taylor_atan.cpp
@@ -39,7 +39,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -232,7 +232,7 @@ TEST_CASE("taylor atan")
             std::vector<fp_t> jet{fp_t{2}, fp_t{-4}, fp_t{3}, fp_t{5}};
             jet.resize(8);
 
-            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t{.3}, fp_t{.3}};
+            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t(.3), fp_t(.3)};
 
             jptr(jet.data(), pars.data(), nullptr);
 
@@ -242,8 +242,8 @@ TEST_CASE("taylor atan")
             REQUIRE(jet[2] == 3);
             REQUIRE(jet[3] == 5);
 
-            REQUIRE(jet[4] == approximately(atan(fp_t{.3})));
-            REQUIRE(jet[5] == approximately(atan(fp_t{.3})));
+            REQUIRE(jet[4] == approximately(atan(fp_t(.3))));
+            REQUIRE(jet[5] == approximately(atan(fp_t(.3))));
 
             REQUIRE(jet[6] == approximately(jet[0] + jet[2]));
             REQUIRE(jet[7] == approximately(jet[1] + jet[3]));
@@ -419,13 +419,13 @@ TEST_CASE("taylor atan")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan(jet[1])));
             REQUIRE(jet[3] == approximately(atan(jet[0])));
         }
@@ -439,16 +439,16 @@ TEST_CASE("taylor atan")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(atan(jet[2])));
             REQUIRE(jet[5] == approximately(atan(jet[3])));
@@ -466,13 +466,13 @@ TEST_CASE("taylor atan")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan(jet[1])));
             REQUIRE(jet[3] == approximately(atan(jet[0])));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * (1 / (1 + jet[1] * jet[1]) * jet[3])));
@@ -488,16 +488,16 @@ TEST_CASE("taylor atan")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(atan(jet[2])));
             REQUIRE(jet[5] == approximately(atan(jet[3])));
@@ -521,18 +521,18 @@ TEST_CASE("taylor atan")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{-.5}, fp_t{.3}, fp_t{-.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t{-.5}, fp_t(.3), fp_t(-.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
-            REQUIRE(jet[2] == -.5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
+            REQUIRE(jet[2] == fp_t(-.5));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == -.4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(-.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(atan(jet[3])));
             REQUIRE(jet[7] == approximately(atan(jet[4])));
diff --git a/test/taylor_atan2.cpp b/test/taylor_atan2.cpp
index 9e8677912..980107ad3 100644
--- a/test/taylor_atan2.cpp
+++ b/test/taylor_atan2.cpp
@@ -41,7 +41,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -385,13 +385,13 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(jet[1], a)));
             REQUIRE(jet[3] == approximately(atan2(jet[0], b)));
         }
@@ -406,15 +406,15 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             std::vector<fp_t> pars{a};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(jet[1], a)));
             REQUIRE(jet[3] == approximately(atan2(jet[0], b)));
         }
@@ -429,16 +429,16 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(jet[2], b)));
             REQUIRE(jet[5] == approximately(atan2(jet[3], b)));
@@ -457,18 +457,18 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             std::vector<fp_t> pars{fp_t{0}, fp_t{0}, b, b};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(jet[2], b)));
             REQUIRE(jet[5] == approximately(atan2(jet[3], b)));
@@ -487,13 +487,13 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(jet[1], b)));
             REQUIRE(jet[3] == approximately(atan2(jet[0], b)));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * b * jet[3] / (jet[1] * jet[1] + b * b)));
@@ -510,16 +510,16 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(jet[2], b)));
             REQUIRE(jet[5] == approximately(atan2(jet[3], b)));
@@ -544,18 +544,18 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.1}, fp_t{.3}, fp_t{.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.1), fp_t(.3), fp_t(.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
-            REQUIRE(jet[2] == .1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
+            REQUIRE(jet[2] == fp_t(.1));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == .4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(atan2(jet[3], b)));
             REQUIRE(jet[7] == approximately(atan2(jet[4], b)));
@@ -621,13 +621,13 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(a, jet[1])));
             REQUIRE(jet[3] == approximately(atan2(c, jet[0])));
         }
@@ -642,15 +642,15 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             std::vector<fp_t> pars{a};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(a, jet[1])));
             REQUIRE(jet[3] == approximately(atan2(c, jet[0])));
         }
@@ -665,16 +665,16 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(a, jet[2])));
             REQUIRE(jet[5] == approximately(atan2(a, jet[3])));
@@ -693,18 +693,18 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             std::vector<fp_t> pars{fp_t{0}, fp_t{0}, c, c};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(a, jet[2])));
             REQUIRE(jet[5] == approximately(atan2(a, jet[3])));
@@ -723,13 +723,13 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(a, jet[1])));
             REQUIRE(jet[3] == approximately(atan2(c, jet[0])));
             REQUIRE(jet[4] == approximately(-fp_t{1} / 2 * a * jet[3] / (jet[1] * jet[1] + a * a)));
@@ -746,16 +746,16 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(a, jet[2])));
             REQUIRE(jet[5] == approximately(atan2(a, jet[3])));
@@ -780,18 +780,18 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.1}, fp_t{.3}, fp_t{.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.1), fp_t(.3), fp_t(.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
-            REQUIRE(jet[2] == .1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
+            REQUIRE(jet[2] == fp_t(.1));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == .4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(atan2(a, jet[3])));
             REQUIRE(jet[7] == approximately(atan2(a, jet[4])));
@@ -858,13 +858,13 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(jet[0], jet[1])));
             REQUIRE(jet[3] == approximately(atan2(jet[1], jet[0])));
         }
@@ -878,16 +878,16 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(jet[0], jet[2])));
             REQUIRE(jet[5] == approximately(atan2(jet[1], jet[3])));
@@ -905,13 +905,13 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atan2(jet[0], jet[1])));
             REQUIRE(jet[3] == approximately(atan2(jet[1], jet[0])));
             REQUIRE(jet[4]
@@ -931,16 +931,16 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(atan2(jet[0], jet[2])));
             REQUIRE(jet[5] == approximately(atan2(jet[1], jet[3])));
@@ -972,18 +972,18 @@ TEST_CASE("taylor atan2")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.1}, fp_t{.3}, fp_t{.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.1), fp_t(.3), fp_t(.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
-            REQUIRE(jet[2] == .1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
+            REQUIRE(jet[2] == fp_t(.1));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == .4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(atan2(jet[0], jet[3])));
             REQUIRE(jet[7] == approximately(atan2(jet[1], jet[4])));
diff --git a/test/taylor_atanh.cpp b/test/taylor_atanh.cpp
index d407b4276..405e6747d 100644
--- a/test/taylor_atanh.cpp
+++ b/test/taylor_atanh.cpp
@@ -39,7 +39,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -261,7 +261,7 @@ TEST_CASE("taylor atanh")
             std::vector<fp_t> jet{fp_t{2}, fp_t{-4}, fp_t{3}, fp_t{5}};
             jet.resize(8);
 
-            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t{.3}, fp_t{.3}};
+            std::vector<fp_t> pars{fp_t{.5}, fp_t{.5}, fp_t(.3), fp_t(.3)};
 
             jptr(jet.data(), pars.data(), nullptr);
 
@@ -271,8 +271,8 @@ TEST_CASE("taylor atanh")
             REQUIRE(jet[2] == 3);
             REQUIRE(jet[3] == 5);
 
-            REQUIRE(jet[4] == approximately(atanh(fp_t{.3})));
-            REQUIRE(jet[5] == approximately(atanh(fp_t{.3})));
+            REQUIRE(jet[4] == approximately(atanh(fp_t(.3))));
+            REQUIRE(jet[5] == approximately(atanh(fp_t(.3))));
 
             REQUIRE(jet[6] == approximately(jet[0] + jet[2]));
             REQUIRE(jet[7] == approximately(jet[1] + jet[3]));
@@ -448,13 +448,13 @@ TEST_CASE("taylor atanh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atanh(jet[1])));
             REQUIRE(jet[3] == approximately(atanh(jet[0])));
         }
@@ -468,16 +468,16 @@ TEST_CASE("taylor atanh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(atanh(jet[2])));
             REQUIRE(jet[5] == approximately(atanh(jet[3])));
@@ -495,13 +495,13 @@ TEST_CASE("taylor atanh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(atanh(jet[1])));
             REQUIRE(jet[3] == approximately(atanh(jet[0])));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * (1 / (1 - jet[1] * jet[1]) * jet[3])));
@@ -517,16 +517,16 @@ TEST_CASE("taylor atanh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{.3}, fp_t{-.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t(.3), fp_t(-.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == -.4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(-.4));
 
             REQUIRE(jet[4] == approximately(atanh(jet[2])));
             REQUIRE(jet[5] == approximately(atanh(jet[3])));
@@ -550,18 +550,18 @@ TEST_CASE("taylor atanh")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{-.1}, fp_t{-.5}, fp_t{.3}, fp_t{-.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(-.1), fp_t{-.5}, fp_t(.3), fp_t(-.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == -.1);
-            REQUIRE(jet[2] == -.5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(-.1));
+            REQUIRE(jet[2] == fp_t(-.5));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == -.4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(-.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(atanh(jet[3])));
             REQUIRE(jet[7] == approximately(atanh(jet[4])));
diff --git a/test/taylor_const_sys.cpp b/test/taylor_const_sys.cpp
index e89e65392..6a44a658d 100644
--- a/test/taylor_const_sys.cpp
+++ b/test/taylor_const_sys.cpp
@@ -37,7 +37,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_constants.cpp b/test/taylor_constants.cpp
index 4149da3b0..35e0e6752 100644
--- a/test/taylor_constants.cpp
+++ b/test/taylor_constants.cpp
@@ -39,7 +39,7 @@ using namespace heyoka;
 using namespace heyoka_test;
 namespace hy = heyoka;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_div.cpp b/test/taylor_div.cpp
index 9690e80a7..ca6f6e89f 100644
--- a/test/taylor_div.cpp
+++ b/test/taylor_div.cpp
@@ -40,7 +40,7 @@ auto div_wrapper(expression a, expression b)
     return detail::div(std::move(a), std::move(b));
 }
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_erf.cpp b/test/taylor_erf.cpp
index d19a37f41..92b49e910 100644
--- a/test/taylor_erf.cpp
+++ b/test/taylor_erf.cpp
@@ -42,7 +42,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -517,8 +517,8 @@ TEST_CASE("taylor erf")
             REQUIRE(jet[1] == 3);
             REQUIRE(jet[2] == approximately(erf(jet[1])));
             REQUIRE(jet[3] == approximately(erf(jet[0])));
-            REQUIRE(jet[4] == approximately(fp_t{1} / 2. * ((2. / sqrt(pi) * exp(-jet[1] * jet[1])) * jet[3])));
-            REQUIRE(jet[5] == approximately(fp_t{1} / 2. * ((2. / sqrt(pi) * exp(-jet[0] * jet[0])) * jet[2])));
+            REQUIRE(jet[4] == approximately(fp_t{1} / 2 * ((2 / sqrt(pi) * exp(-jet[1] * jet[1])) * jet[3])));
+            REQUIRE(jet[5] == approximately(fp_t{1} / 2 * ((2 / sqrt(pi) * exp(-jet[0] * jet[0])) * jet[2])));
         }
 
         if constexpr (!std::is_same_v<long double, fp_t> || !skip_batch_ld) {
@@ -547,11 +547,11 @@ TEST_CASE("taylor erf")
             REQUIRE(jet[6] == approximately(erf(jet[0])));
             REQUIRE(jet[7] == approximately(erf(jet[1])));
 
-            REQUIRE(jet[8] == approximately(fp_t{1} / 2 * ((2. / sqrt(pi) * exp(-jet[2] * jet[2])) * jet[6])));
-            REQUIRE(jet[9] == approximately(fp_t{1} / 2 * ((2. / sqrt(pi) * exp(-jet[3] * jet[3])) * jet[7])));
+            REQUIRE(jet[8] == approximately(fp_t{1} / 2 * ((2 / sqrt(pi) * exp(-jet[2] * jet[2])) * jet[6])));
+            REQUIRE(jet[9] == approximately(fp_t{1} / 2 * ((2 / sqrt(pi) * exp(-jet[3] * jet[3])) * jet[7])));
 
-            REQUIRE(jet[10] == approximately(fp_t{1} / 2 * ((2. / sqrt(pi) * exp(-jet[0] * jet[0])) * jet[4])));
-            REQUIRE(jet[11] == approximately(fp_t{1} / 2 * ((2. / sqrt(pi) * exp(-jet[1] * jet[1])) * jet[5])));
+            REQUIRE(jet[10] == approximately(fp_t{1} / 2 * ((2 / sqrt(pi) * exp(-jet[0] * jet[0])) * jet[4])));
+            REQUIRE(jet[11] == approximately(fp_t{1} / 2 * ((2 / sqrt(pi) * exp(-jet[1] * jet[1])) * jet[5])));
         }
 
         if constexpr (!std::is_same_v<long double, fp_t> || !skip_batch_ld) {
@@ -584,39 +584,39 @@ TEST_CASE("taylor erf")
             REQUIRE(jet[10] == approximately(erf(jet[1])));
             REQUIRE(jet[11] == approximately(erf(jet[2])));
 
-            REQUIRE(jet[12] == approximately(fp_t{1} / 2 * (2. / sqrt(pi) * exp(-jet[3] * jet[3]) * jet[9])));
-            REQUIRE(jet[13] == approximately(fp_t{1} / 2 * (2. / sqrt(pi) * exp(-jet[4] * jet[4]) * jet[10])));
-            REQUIRE(jet[14] == approximately(fp_t{1} / 2 * (2. / sqrt(pi) * exp(-jet[5] * jet[5]) * jet[11])));
+            REQUIRE(jet[12] == approximately(fp_t{1} / 2 * (2 / sqrt(pi) * exp(-jet[3] * jet[3]) * jet[9])));
+            REQUIRE(jet[13] == approximately(fp_t{1} / 2 * (2 / sqrt(pi) * exp(-jet[4] * jet[4]) * jet[10])));
+            REQUIRE(jet[14] == approximately(fp_t{1} / 2 * (2 / sqrt(pi) * exp(-jet[5] * jet[5]) * jet[11])));
 
-            REQUIRE(jet[15] == approximately(fp_t{1} / 2 * (2. / sqrt(pi) * exp(-jet[0] * jet[0]) * jet[6])));
-            REQUIRE(jet[16] == approximately(fp_t{1} / 2 * (2. / sqrt(pi) * exp(-jet[1] * jet[1]) * jet[7])));
-            REQUIRE(jet[17] == approximately(fp_t{1} / 2 * (2. / sqrt(pi) * exp(-jet[2] * jet[2]) * jet[8])));
+            REQUIRE(jet[15] == approximately(fp_t{1} / 2 * (2 / sqrt(pi) * exp(-jet[0] * jet[0]) * jet[6])));
+            REQUIRE(jet[16] == approximately(fp_t{1} / 2 * (2 / sqrt(pi) * exp(-jet[1] * jet[1]) * jet[7])));
+            REQUIRE(jet[17] == approximately(fp_t{1} / 2 * (2 / sqrt(pi) * exp(-jet[2] * jet[2]) * jet[8])));
 
             REQUIRE(jet[18]
-                    == approximately(fp_t{1} / 6 * 2. / sqrt(pi)
-                                     * (-2. * exp(-jet[3] * jet[3]) * jet[3] * jet[9] * jet[9]
-                                        + exp(-jet[3] * jet[3]) * 2. / sqrt(pi) * exp(-jet[0] * jet[0]) * jet[6])));
+                    == approximately(fp_t{1} / 6 * 2 / sqrt(pi)
+                                     * (-2 * exp(-jet[3] * jet[3]) * jet[3] * jet[9] * jet[9]
+                                        + exp(-jet[3] * jet[3]) * 2 / sqrt(pi) * exp(-jet[0] * jet[0]) * jet[6])));
             REQUIRE(jet[19]
-                    == approximately(fp_t{1} / 6 * 2. / sqrt(pi)
-                                     * (-2. * exp(-jet[4] * jet[4]) * jet[4] * jet[10] * jet[10]
-                                        + exp(-jet[4] * jet[4]) * 2. / sqrt(pi) * exp(-jet[1] * jet[1]) * jet[7])));
+                    == approximately(fp_t{1} / 6 * 2 / sqrt(pi)
+                                     * (-2 * exp(-jet[4] * jet[4]) * jet[4] * jet[10] * jet[10]
+                                        + exp(-jet[4] * jet[4]) * 2 / sqrt(pi) * exp(-jet[1] * jet[1]) * jet[7])));
             REQUIRE(jet[20]
-                    == approximately(fp_t{1} / 6 * 2. / sqrt(pi)
-                                     * (-2. * exp(-jet[5] * jet[5]) * jet[5] * jet[11] * jet[11]
-                                        + exp(-jet[5] * jet[5]) * 2. / sqrt(pi) * exp(-jet[2] * jet[2]) * jet[8])));
+                    == approximately(fp_t{1} / 6 * 2 / sqrt(pi)
+                                     * (-2 * exp(-jet[5] * jet[5]) * jet[5] * jet[11] * jet[11]
+                                        + exp(-jet[5] * jet[5]) * 2 / sqrt(pi) * exp(-jet[2] * jet[2]) * jet[8])));
 
             REQUIRE(jet[21]
-                    == approximately(fp_t{1} / 6 * 2. / sqrt(pi)
-                                     * (-2. * exp(-jet[0] * jet[0]) * jet[0] * jet[6] * jet[6]
-                                        + exp(-jet[0] * jet[0]) * 2. / sqrt(pi) * exp(-jet[3] * jet[3]) * jet[9])));
+                    == approximately(fp_t{1} / 6 * 2 / sqrt(pi)
+                                     * (-2 * exp(-jet[0] * jet[0]) * jet[0] * jet[6] * jet[6]
+                                        + exp(-jet[0] * jet[0]) * 2 / sqrt(pi) * exp(-jet[3] * jet[3]) * jet[9])));
             REQUIRE(jet[22]
-                    == approximately(fp_t{1} / 6 * 2. / sqrt(pi)
-                                     * (-2. * exp(-jet[1] * jet[1]) * jet[1] * jet[7] * jet[7]
-                                        + exp(-jet[1] * jet[1]) * 2. / sqrt(pi) * exp(-jet[4] * jet[4]) * jet[10])));
+                    == approximately(fp_t{1} / 6 * 2 / sqrt(pi)
+                                     * (-2 * exp(-jet[1] * jet[1]) * jet[1] * jet[7] * jet[7]
+                                        + exp(-jet[1] * jet[1]) * 2 / sqrt(pi) * exp(-jet[4] * jet[4]) * jet[10])));
             REQUIRE(jet[23]
-                    == approximately(fp_t{1} / 6 * 2. / sqrt(pi)
-                                     * (-2. * exp(-jet[2] * jet[2]) * jet[2] * jet[8] * jet[8]
-                                        + exp(-jet[2] * jet[2]) * 2. / sqrt(pi) * exp(-jet[5] * jet[5]) * jet[11])));
+                    == approximately(fp_t{1} / 6 * 2 / sqrt(pi)
+                                     * (-2 * exp(-jet[2] * jet[2]) * jet[2] * jet[8] * jet[8]
+                                        + exp(-jet[2] * jet[2]) * 2 / sqrt(pi) * exp(-jet[5] * jet[5]) * jet[11])));
         }
 
         if constexpr (!std::is_same_v<long double, fp_t> || !skip_batch_ld) {
diff --git a/test/taylor_exp.cpp b/test/taylor_exp.cpp
index 8619d7648..3820174e8 100644
--- a/test/taylor_exp.cpp
+++ b/test/taylor_exp.cpp
@@ -35,7 +35,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_kepE.cpp b/test/taylor_kepE.cpp
index 5051f703a..668863b2f 100644
--- a/test/taylor_kepE.cpp
+++ b/test/taylor_kepE.cpp
@@ -42,7 +42,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -368,13 +368,13 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(jet[1], a)));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(jet[0], b)));
         }
@@ -389,15 +389,15 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             std::vector<fp_t> pars{a};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(jet[1], a)));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(jet[0], b)));
         }
@@ -412,16 +412,16 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(jet[2], b)));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(jet[3], b)));
@@ -440,18 +440,18 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             std::vector<fp_t> pars{fp_t{0}, fp_t{0}, b, b};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(jet[2], b)));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(jet[3], b)));
@@ -470,13 +470,13 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(jet[1], b)));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(jet[0], b)));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * jet[3] * sin(jet[2]) / (1 - jet[1] * cos(jet[2]))));
@@ -493,16 +493,16 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(jet[2], b)));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(jet[3], b)));
@@ -527,18 +527,18 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.1}, fp_t{.3}, fp_t{.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.1), fp_t(.3), fp_t(.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
-            REQUIRE(jet[2] == .1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
+            REQUIRE(jet[2] == fp_t(.1));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == .4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(bmt_inv_kep_E(jet[3], b)));
             REQUIRE(jet[7] == approximately(bmt_inv_kep_E(jet[4], b)));
@@ -626,13 +626,13 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(a, jet[1])));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(c, jet[0])));
         }
@@ -647,15 +647,15 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             std::vector<fp_t> pars{a};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(a, jet[1])));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(c, jet[0])));
         }
@@ -670,16 +670,16 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(a, jet[2])));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(a, jet[3])));
@@ -698,18 +698,18 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             std::vector<fp_t> pars{fp_t{0}, fp_t{0}, c, c};
 
             jptr(jet.data(), pars.data(), nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(a, jet[2])));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(a, jet[3])));
@@ -728,13 +728,13 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(a, jet[1])));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(c, jet[0])));
             REQUIRE(jet[4] == approximately(fp_t{1} / 2 * jet[3] / (1 - a * cos(jet[2]))));
@@ -751,16 +751,16 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(a, jet[2])));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(a, jet[3])));
@@ -785,18 +785,18 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.1}, fp_t{.3}, fp_t{.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.1), fp_t(.3), fp_t(.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
-            REQUIRE(jet[2] == .1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
+            REQUIRE(jet[2] == fp_t(.1));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == .4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(bmt_inv_kep_E(a, jet[3])));
             REQUIRE(jet[7] == approximately(bmt_inv_kep_E(a, jet[4])));
@@ -859,13 +859,13 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(4);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == fp_t{.2});
-            REQUIRE(jet[1] == fp_t{.3});
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(jet[0], jet[1])));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(jet[1], jet[0])));
         }
@@ -879,16 +879,16 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(8);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(jet[0], jet[2])));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(jet[1], jet[3])));
@@ -906,13 +906,13 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.3}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t(.3)};
             jet.resize(6);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .3);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.3));
             REQUIRE(jet[2] == approximately(bmt_inv_kep_E(jet[0], jet[1])));
             REQUIRE(jet[3] == approximately(bmt_inv_kep_E(jet[1], jet[0])));
             REQUIRE(jet[4]
@@ -930,16 +930,16 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.3}, fp_t{.4}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.3), fp_t(.4)};
             jet.resize(12);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
 
-            REQUIRE(jet[2] == .3);
-            REQUIRE(jet[3] == .4);
+            REQUIRE(jet[2] == fp_t(.3));
+            REQUIRE(jet[3] == fp_t(.4));
 
             REQUIRE(jet[4] == approximately(bmt_inv_kep_E(jet[0], jet[2])));
             REQUIRE(jet[5] == approximately(bmt_inv_kep_E(jet[1], jet[3])));
@@ -967,18 +967,18 @@ TEST_CASE("taylor kepE")
 
             auto jptr = reinterpret_cast<void (*)(fp_t *, const fp_t *, const fp_t *)>(s.jit_lookup("jet"));
 
-            std::vector<fp_t> jet{fp_t{.2}, fp_t{.5}, fp_t{.1}, fp_t{.3}, fp_t{.4}, fp_t{.6}};
+            std::vector<fp_t> jet{fp_t(.2), fp_t{.5}, fp_t(.1), fp_t(.3), fp_t(.4), fp_t(.6)};
             jet.resize(24);
 
             jptr(jet.data(), nullptr, nullptr);
 
-            REQUIRE(jet[0] == .2);
-            REQUIRE(jet[1] == .5);
-            REQUIRE(jet[2] == .1);
+            REQUIRE(jet[0] == fp_t(.2));
+            REQUIRE(jet[1] == fp_t(.5));
+            REQUIRE(jet[2] == fp_t(.1));
 
-            REQUIRE(jet[3] == .3);
-            REQUIRE(jet[4] == .4);
-            REQUIRE(jet[5] == .6);
+            REQUIRE(jet[3] == fp_t(.3));
+            REQUIRE(jet[4] == fp_t(.4));
+            REQUIRE(jet[5] == fp_t(.6));
 
             REQUIRE(jet[6] == approximately(bmt_inv_kep_E(jet[0], jet[3])));
             REQUIRE(jet[7] == approximately(bmt_inv_kep_E(jet[1], jet[4])));
diff --git a/test/taylor_kepF.cpp b/test/taylor_kepF.cpp
index 8377426de..e9fe741fd 100644
--- a/test/taylor_kepF.cpp
+++ b/test/taylor_kepF.cpp
@@ -31,7 +31,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_log.cpp b/test/taylor_log.cpp
index 46f6c273d..6a9524795 100644
--- a/test/taylor_log.cpp
+++ b/test/taylor_log.cpp
@@ -35,7 +35,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_mul.cpp b/test/taylor_mul.cpp
index 506407b75..d9648749a 100644
--- a/test/taylor_mul.cpp
+++ b/test/taylor_mul.cpp
@@ -34,7 +34,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -238,8 +238,8 @@ TEST_CASE("taylor mul")
             REQUIRE(jet[7] == approximately(-fp_t{5}));
             REQUIRE(jet[8] == 0);
             REQUIRE(jet[9] == 0);
-            REQUIRE(jet[10] == approximately(.5 * (fp_t{6} + jet[6])));
-            REQUIRE(jet[11] == approximately(.5 * (fp_t{6} + jet[7])));
+            REQUIRE(jet[10] == approximately(fp_t(.5) * (fp_t{6} + jet[6])));
+            REQUIRE(jet[11] == approximately(fp_t(.5) * (fp_t{6} + jet[7])));
         }
 
         {
diff --git a/test/taylor_neg.cpp b/test/taylor_neg.cpp
index 097e47c1b..07fb38af6 100644
--- a/test/taylor_neg.cpp
+++ b/test/taylor_neg.cpp
@@ -34,7 +34,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_no_decomp_sys.cpp b/test/taylor_no_decomp_sys.cpp
index e1f477e80..2ddaa5cd2 100644
--- a/test/taylor_no_decomp_sys.cpp
+++ b/test/taylor_no_decomp_sys.cpp
@@ -35,7 +35,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_nt_event.cpp b/test/taylor_nt_event.cpp
index e2625b5c6..82d064786 100644
--- a/test/taylor_nt_event.cpp
+++ b/test/taylor_nt_event.cpp
@@ -9,7 +9,6 @@
 #include <heyoka/config.hpp>
 
 #include <cmath>
-#include <cstddef>
 #include <initializer_list>
 #include <iostream>
 #include <limits>
@@ -42,7 +41,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -320,14 +319,17 @@ TEST_CASE("taylor nte multizero")
 
         fp_t cur_time(0);
 
+        // NOTE: don't make the small delta too small in single-precision.
+        const auto small_delta = std::is_same_v<fp_t, float> ? 1e-6 : 1e-10;
+
         auto ta = taylor_adaptive<fp_t>{
             {prime(x) = v, prime(v) = -9.8 * sin(x)},
             {fp_t(0), fp_t(.25)},
             kw::opt_level = opt_level,
             kw::high_accuracy = high_accuracy,
             kw::compact_mode = compact_mode,
-            kw::nt_events = {ev_t(v * v - 1e-10,
-                                  [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
+            kw::nt_events = {ev_t(v * v - small_delta,
+                                  [&counter, &cur_time, small_delta](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
                                       using std::abs;
 
                                       // Make sure the callbacks are called in order.
@@ -343,7 +345,7 @@ TEST_CASE("taylor nte multizero")
                                       ta_.update_d_output(t);
 
                                       const auto vel = ta_.get_d_output()[1];
-                                      REQUIRE(abs(vel * vel - 1e-10) < std::numeric_limits<fp_t>::epsilon());
+                                      REQUIRE(abs(vel * vel - small_delta) < std::numeric_limits<fp_t>::epsilon());
 
                                       ++counter;
 
@@ -380,58 +382,58 @@ TEST_CASE("taylor nte multizero")
         cur_time = 0;
 
         // Run the same test with sub-eps tolerance too.
-        ta = taylor_adaptive<fp_t>{{prime(x) = v, prime(v) = -9.8 * sin(x)},
-                                   {fp_t(0), fp_t(.25)},
-                                   kw::tol = std::numeric_limits<fp_t>::epsilon() / 100,
-                                   kw::opt_level = opt_level,
-                                   kw::high_accuracy = high_accuracy,
-                                   kw::compact_mode = compact_mode,
-                                   kw::nt_events
-                                   = {ev_t(v * v - 1e-10,
-                                           [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
-                                               using std::abs;
+        ta = taylor_adaptive<fp_t>{
+            {prime(x) = v, prime(v) = -9.8 * sin(x)},
+            {fp_t(0), fp_t(.25)},
+            kw::tol = std::numeric_limits<fp_t>::epsilon() / 100,
+            kw::opt_level = opt_level,
+            kw::high_accuracy = high_accuracy,
+            kw::compact_mode = compact_mode,
+            kw::nt_events = {ev_t(v * v - small_delta,
+                                  [&counter, &cur_time, small_delta](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
+                                      using std::abs;
 
-                                               // Make sure the callbacks are called in order.
-                                               REQUIRE(t > cur_time);
+                                      // Make sure the callbacks are called in order.
+                                      REQUIRE(t > cur_time);
 
-                                               // Ensure the state of ta has
-                                               // been propagated until after the
-                                               // event.
-                                               REQUIRE(ta_.get_time() > t);
+                                      // Ensure the state of ta has
+                                      // been propagated until after the
+                                      // event.
+                                      REQUIRE(ta_.get_time() > t);
 
-                                               REQUIRE((counter % 3u == 0u || counter % 3u == 2u));
+                                      REQUIRE((counter % 3u == 0u || counter % 3u == 2u));
 
-                                               ta_.update_d_output(t);
+                                      ta_.update_d_output(t);
 
-                                               const auto vel = ta_.get_d_output()[1];
-                                               REQUIRE(abs(vel * vel - 1e-10) < std::numeric_limits<fp_t>::epsilon());
+                                      const auto vel = ta_.get_d_output()[1];
+                                      REQUIRE(abs(vel * vel - small_delta) < std::numeric_limits<fp_t>::epsilon());
 
-                                               ++counter;
+                                      ++counter;
 
-                                               cur_time = t;
-                                           }),
-                                      ev_t(v, [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
-                                          using std::abs;
+                                      cur_time = t;
+                                  }),
+                             ev_t(v, [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
+                                 using std::abs;
 
-                                          // Make sure the callbacks are called in order.
-                                          REQUIRE(t > cur_time);
+                                 // Make sure the callbacks are called in order.
+                                 REQUIRE(t > cur_time);
 
-                                          // Ensure the state of ta has
-                                          // been propagated until after the
-                                          // event.
-                                          REQUIRE(ta_.get_time() > t);
+                                 // Ensure the state of ta has
+                                 // been propagated until after the
+                                 // event.
+                                 REQUIRE(ta_.get_time() > t);
 
-                                          REQUIRE((counter % 3u == 1u));
+                                 REQUIRE((counter % 3u == 1u));
 
-                                          ta_.update_d_output(t);
+                                 ta_.update_d_output(t);
 
-                                          const auto vel = ta_.get_d_output()[1];
-                                          REQUIRE(abs(vel) <= std::numeric_limits<fp_t>::epsilon() * 100);
+                                 const auto vel = ta_.get_d_output()[1];
+                                 REQUIRE(abs(vel) <= std::numeric_limits<fp_t>::epsilon() * 100);
 
-                                          ++counter;
+                                 ++counter;
 
-                                          cur_time = t;
-                                      })}};
+                                 cur_time = t;
+                             })}};
 
         REQUIRE(std::get<0>(ta.propagate_until(fp_t(4))) == taylor_outcome::time_limit);
 
@@ -447,64 +449,64 @@ TEST_CASE("taylor nte multizero")
         // - 0 0
         // - 0 1 0
         // - 0 0
-        ta = taylor_adaptive<fp_t>{{prime(x) = v, prime(v) = -9.8 * sin(x)},
-                                   {fp_t(0), fp_t(.25)},
-                                   kw::opt_level = opt_level,
-                                   kw::high_accuracy = high_accuracy,
-                                   kw::compact_mode = compact_mode,
-                                   kw::nt_events
-                                   = {ev_t(v * v - 1e-10,
-                                           [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
-                                               using std::abs;
+        ta = taylor_adaptive<fp_t>{
+            {prime(x) = v, prime(v) = -9.8 * sin(x)},
+            {fp_t(0), fp_t(.25)},
+            kw::opt_level = opt_level,
+            kw::high_accuracy = high_accuracy,
+            kw::compact_mode = compact_mode,
+            kw::nt_events = {ev_t(v * v - small_delta,
+                                  [&counter, &cur_time, small_delta](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
+                                      using std::abs;
 
-                                               // Make sure the callbacks are called in order.
-                                               REQUIRE(t > cur_time);
+                                      // Make sure the callbacks are called in order.
+                                      REQUIRE(t > cur_time);
 
-                                               // Ensure the state of ta has
-                                               // been propagated until after the
-                                               // event.
-                                               REQUIRE(ta_.get_time() > t);
+                                      // Ensure the state of ta has
+                                      // been propagated until after the
+                                      // event.
+                                      REQUIRE(ta_.get_time() > t);
 
-                                               REQUIRE((counter == 0u || (counter >= 2u && counter <= 6u)
-                                                        || (counter >= 7u && counter <= 9u)));
+                                      REQUIRE((counter == 0u || (counter >= 2u && counter <= 6u)
+                                               || (counter >= 7u && counter <= 9u)));
 
-                                               ta_.update_d_output(t);
+                                      ta_.update_d_output(t);
 
-                                               const auto vel = ta_.get_d_output()[1];
-                                               REQUIRE(abs(vel * vel - 1e-10) < std::numeric_limits<fp_t>::epsilon());
+                                      const auto vel = ta_.get_d_output()[1];
+                                      REQUIRE(abs(vel * vel - small_delta) < std::numeric_limits<fp_t>::epsilon());
 
-                                               ++counter;
+                                      ++counter;
 
-                                               cur_time = t;
-                                           }),
-                                      ev_t(
-                                          v,
+                                      cur_time = t;
+                                  }),
+                             ev_t(
+                                 v,
 
-                                          [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int d_sgn) {
-                                              using std::abs;
+                                 [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int d_sgn) {
+                                     using std::abs;
 
-                                              REQUIRE(d_sgn == -1);
+                                     REQUIRE(d_sgn == -1);
 
-                                              // Make sure the callbacks are called in order.
-                                              REQUIRE(t > cur_time);
+                                     // Make sure the callbacks are called in order.
+                                     REQUIRE(t > cur_time);
 
-                                              // Ensure the state of ta has
-                                              // been propagated until after the
-                                              // event.
-                                              REQUIRE(ta_.get_time() > t);
+                                     // Ensure the state of ta has
+                                     // been propagated until after the
+                                     // event.
+                                     REQUIRE(ta_.get_time() > t);
 
-                                              REQUIRE((counter == 1u || counter == 6u));
+                                     REQUIRE((counter == 1u || counter == 6u));
 
-                                              ta_.update_d_output(t);
+                                     ta_.update_d_output(t);
 
-                                              const auto vel = ta_.get_d_output()[1];
-                                              REQUIRE(abs(vel) <= std::numeric_limits<fp_t>::epsilon() * 100);
+                                     const auto vel = ta_.get_d_output()[1];
+                                     REQUIRE(abs(vel) <= std::numeric_limits<fp_t>::epsilon() * 100);
 
-                                              ++counter;
+                                     ++counter;
 
-                                              cur_time = t;
-                                          },
-                                          kw::direction = event_direction::negative)}};
+                                     cur_time = t;
+                                 },
+                                 kw::direction = event_direction::negative)}};
 
         REQUIRE(std::get<0>(ta.propagate_until(fp_t(4))) == taylor_outcome::time_limit);
 
@@ -514,65 +516,65 @@ TEST_CASE("taylor nte multizero")
         cur_time = 0;
 
         // Sub-eps tolerance too.
-        ta = taylor_adaptive<fp_t>{{prime(x) = v, prime(v) = -9.8 * sin(x)},
-                                   {fp_t(0), fp_t(.25)},
-                                   kw::tol = std::numeric_limits<fp_t>::epsilon() / 100,
-                                   kw::opt_level = opt_level,
-                                   kw::high_accuracy = high_accuracy,
-                                   kw::compact_mode = compact_mode,
-                                   kw::nt_events
-                                   = {ev_t(v * v - 1e-10,
-                                           [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
-                                               using std::abs;
+        ta = taylor_adaptive<fp_t>{
+            {prime(x) = v, prime(v) = -9.8 * sin(x)},
+            {fp_t(0), fp_t(.25)},
+            kw::tol = std::numeric_limits<fp_t>::epsilon() / 100,
+            kw::opt_level = opt_level,
+            kw::high_accuracy = high_accuracy,
+            kw::compact_mode = compact_mode,
+            kw::nt_events = {ev_t(v * v - small_delta,
+                                  [&counter, &cur_time, small_delta](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
+                                      using std::abs;
 
-                                               // Make sure the callbacks are called in order.
-                                               REQUIRE(t > cur_time);
+                                      // Make sure the callbacks are called in order.
+                                      REQUIRE(t > cur_time);
 
-                                               // Ensure the state of ta has
-                                               // been propagated until after the
-                                               // event.
-                                               REQUIRE(ta_.get_time() > t);
+                                      // Ensure the state of ta has
+                                      // been propagated until after the
+                                      // event.
+                                      REQUIRE(ta_.get_time() > t);
 
-                                               REQUIRE((counter == 0u || (counter >= 2u && counter <= 6u)
-                                                        || (counter >= 7u && counter <= 9u)));
+                                      REQUIRE((counter == 0u || (counter >= 2u && counter <= 6u)
+                                               || (counter >= 7u && counter <= 9u)));
 
-                                               ta_.update_d_output(t);
+                                      ta_.update_d_output(t);
 
-                                               const auto vel = ta_.get_d_output()[1];
-                                               REQUIRE(abs(vel * vel - 1e-10) < std::numeric_limits<fp_t>::epsilon());
+                                      const auto vel = ta_.get_d_output()[1];
+                                      REQUIRE(abs(vel * vel - small_delta) < std::numeric_limits<fp_t>::epsilon());
 
-                                               ++counter;
+                                      ++counter;
 
-                                               cur_time = t;
-                                           }),
-                                      ev_t(
-                                          v,
+                                      cur_time = t;
+                                  }),
+                             ev_t(
+                                 v,
 
-                                          [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int d_sgn) {
-                                              using std::abs;
+                                 [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int d_sgn) {
+                                     using std::abs;
 
-                                              REQUIRE(d_sgn == -1);
+                                     REQUIRE(d_sgn == -1);
 
-                                              // Make sure the callbacks are called in order.
-                                              REQUIRE(t > cur_time);
+                                     // Make sure the callbacks are called in order.
+                                     REQUIRE(t > cur_time);
 
-                                              // Ensure the state of ta has
-                                              // been propagated until after the
-                                              // event.
-                                              REQUIRE(ta_.get_time() > t);
+                                     // Ensure the state of ta has
+                                     // been propagated until after the
+                                     // event.
+                                     REQUIRE(ta_.get_time() > t);
 
-                                              REQUIRE((counter == 1u || counter == 6u));
+                                     REQUIRE((counter == 1u || counter == 6u));
 
-                                              ta_.update_d_output(t);
+                                     ta_.update_d_output(t);
 
-                                              const auto vel = ta_.get_d_output()[1];
-                                              REQUIRE(abs(vel) <= std::numeric_limits<fp_t>::epsilon() * 100);
+                                     const auto vel = ta_.get_d_output()[1];
+                                     REQUIRE(abs(vel) <= std::numeric_limits<fp_t>::epsilon() * 100);
 
-                                              ++counter;
+                                     ++counter;
 
-                                              cur_time = t;
-                                          },
-                                          kw::direction = event_direction::negative)}};
+                                     cur_time = t;
+                                 },
+                                 kw::direction = event_direction::negative)}};
 
         REQUIRE(std::get<0>(ta.propagate_until(fp_t(4))) == taylor_outcome::time_limit);
 
@@ -602,6 +604,9 @@ TEST_CASE("taylor nte multizero negative timestep")
 
         fp_t cur_time(0);
 
+        // NOTE: don't make the small delta too small in single-precision.
+        const auto small_delta = std::is_same_v<fp_t, float> ? 1e-6 : 1e-10;
+
         // In this test, we define two events:
         // - the velocity is smaller in absolute
         //   value than a small limit,
@@ -616,8 +621,8 @@ TEST_CASE("taylor nte multizero negative timestep")
             kw::opt_level = opt_level,
             kw::high_accuracy = high_accuracy,
             kw::compact_mode = compact_mode,
-            kw::nt_events = {ev_t(v * v - 1e-10,
-                                  [&counter, &cur_time](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
+            kw::nt_events = {ev_t(v * v - small_delta,
+                                  [&counter, &cur_time, small_delta](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
                                       using std::abs;
 
                                       // Make sure the callbacks are called in order.
@@ -633,7 +638,7 @@ TEST_CASE("taylor nte multizero negative timestep")
                                       ta_.update_d_output(t);
 
                                       const auto vel = ta_.get_d_output()[1];
-                                      REQUIRE(abs(vel * vel - 1e-10) < std::numeric_limits<fp_t>::epsilon());
+                                      REQUIRE(abs(vel * vel - small_delta) < std::numeric_limits<fp_t>::epsilon());
 
                                       ++counter;
 
@@ -788,6 +793,7 @@ struct s11n_callback {
     }
 };
 
+HEYOKA_S11N_CALLABLE_EXPORT(s11n_callback, void, taylor_adaptive<float> &, float, int)
 HEYOKA_S11N_CALLABLE_EXPORT(s11n_callback, void, taylor_adaptive<double> &, double, int)
 HEYOKA_S11N_CALLABLE_EXPORT(s11n_callback, void, taylor_adaptive<long double> &, long double, int)
 
diff --git a/test/taylor_pow.cpp b/test/taylor_pow.cpp
index 3344b04c3..eded5ea72 100644
--- a/test/taylor_pow.cpp
+++ b/test/taylor_pow.cpp
@@ -40,7 +40,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_prod.cpp b/test/taylor_prod.cpp
index d80eea668..0726710c5 100644
--- a/test/taylor_prod.cpp
+++ b/test/taylor_prod.cpp
@@ -37,7 +37,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -301,8 +301,8 @@ TEST_CASE("taylor mul")
             REQUIRE(jet[7] == approximately(-fp_t{5}));
             REQUIRE(jet[8] == 0);
             REQUIRE(jet[9] == 0);
-            REQUIRE(jet[10] == approximately(.5 * (fp_t{6} + jet[6])));
-            REQUIRE(jet[11] == approximately(.5 * (fp_t{6} + jet[7])));
+            REQUIRE(jet[10] == approximately(fp_t(.5) * (fp_t{6} + jet[6])));
+            REQUIRE(jet[11] == approximately(fp_t(.5) * (fp_t{6} + jet[7])));
         }
 
         {
diff --git a/test/taylor_relu.cpp b/test/taylor_relu.cpp
index 9a32085fc..b3f9daca9 100644
--- a/test/taylor_relu.cpp
+++ b/test/taylor_relu.cpp
@@ -30,7 +30,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_sigmoid.cpp b/test/taylor_sigmoid.cpp
index 1d3fd2a28..c97879d4a 100644
--- a/test/taylor_sigmoid.cpp
+++ b/test/taylor_sigmoid.cpp
@@ -39,7 +39,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -97,6 +97,11 @@ void compare_batch_scalar(std::initializer_list<U> sys, unsigned opt_level, bool
     }
 }
 
+float sigmoid(float x) // Single-precision overload: logistic function 1 / (1 + exp(-x)).
+{
+    return 1 / (1 + std::exp(-x)); // Integer literals (unlike the 1. literals in the double overload) add no double operands.
+}
+
 double sigmoid(double x)
 {
     return 1. / (1. + std::exp(-x));
diff --git a/test/taylor_sincos.cpp b/test/taylor_sincos.cpp
index f9a59cd97..8efebd5fd 100644
--- a/test/taylor_sincos.cpp
+++ b/test/taylor_sincos.cpp
@@ -35,7 +35,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_sinhcosh.cpp b/test/taylor_sinhcosh.cpp
index d36919514..0ff853967 100644
--- a/test/taylor_sinhcosh.cpp
+++ b/test/taylor_sinhcosh.cpp
@@ -39,7 +39,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -91,7 +91,7 @@ void compare_batch_scalar(std::initializer_list<U> sys, unsigned opt_level, bool
             jptr_scalar(jet_scalar.data(), nullptr, nullptr);
 
             for (auto i = 2u; i < 8u; ++i) {
-                REQUIRE(jet_scalar[i] == approximately(jet_batch[i * batch_size + batch_idx]));
+                REQUIRE(jet_scalar[i] == approximately(jet_batch[i * batch_size + batch_idx], T(1000)));
             }
         }
     }
diff --git a/test/taylor_sqrt.cpp b/test/taylor_sqrt.cpp
index 08f8022c1..7bd95285c 100644
--- a/test/taylor_sqrt.cpp
+++ b/test/taylor_sqrt.cpp
@@ -35,7 +35,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_square.cpp b/test/taylor_square.cpp
index c8689e7b2..bc53d0ae2 100644
--- a/test/taylor_square.cpp
+++ b/test/taylor_square.cpp
@@ -38,7 +38,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_sub.cpp b/test/taylor_sub.cpp
index ae45ce800..24c55786d 100644
--- a/test/taylor_sub.cpp
+++ b/test/taylor_sub.cpp
@@ -38,7 +38,7 @@ auto sub_wrapper(expression a, expression b)
     return detail::sub(std::move(a), std::move(b));
 }
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -260,8 +260,8 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[7] == approximately(-fp_t{5}));
             REQUIRE(jet[8] == 0);
             REQUIRE(jet[9] == 0);
-            REQUIRE(jet[10] == approximately(.5 * (jet[4] + jet[6])));
-            REQUIRE(jet[11] == approximately(.5 * (jet[5] + jet[7])));
+            REQUIRE(jet[10] == approximately(fp_t(.5) * (jet[4] + jet[6])));
+            REQUIRE(jet[11] == approximately(fp_t(.5) * (jet[5] + jet[7])));
         }
 
         {
@@ -471,8 +471,8 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[1] == 3);
             REQUIRE(jet[2] == approximately(jet[1] - 2));
             REQUIRE(jet[3] == approximately(jet[0] + 4));
-            REQUIRE(jet[4] == approximately(.5 * jet[3]));
-            REQUIRE(jet[5] == approximately(.5 * jet[2]));
+            REQUIRE(jet[4] == approximately(fp_t(.5) * jet[3]));
+            REQUIRE(jet[5] == approximately(fp_t(.5) * jet[2]));
         }
 
         {
@@ -498,10 +498,10 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[5] == approximately(jet[3] - 2));
             REQUIRE(jet[6] == approximately(jet[0] + 4));
             REQUIRE(jet[7] == approximately(jet[1] + 4));
-            REQUIRE(jet[8] == approximately(.5 * jet[6]));
-            REQUIRE(jet[9] == approximately(.5 * jet[7]));
-            REQUIRE(jet[10] == approximately(.5 * jet[4]));
-            REQUIRE(jet[11] == approximately(.5 * jet[5]));
+            REQUIRE(jet[8] == approximately(fp_t(.5) * jet[6]));
+            REQUIRE(jet[9] == approximately(fp_t(.5) * jet[7]));
+            REQUIRE(jet[10] == approximately(fp_t(.5) * jet[4]));
+            REQUIRE(jet[11] == approximately(fp_t(.5) * jet[5]));
         }
 
         {
@@ -535,13 +535,13 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[10] == approximately(jet[1] + 4));
             REQUIRE(jet[11] == approximately(jet[2] + 4));
 
-            REQUIRE(jet[12] == approximately(.5 * jet[9]));
-            REQUIRE(jet[13] == approximately(.5 * jet[10]));
-            REQUIRE(jet[14] == approximately(.5 * jet[11]));
+            REQUIRE(jet[12] == approximately(fp_t(.5) * jet[9]));
+            REQUIRE(jet[13] == approximately(fp_t(.5) * jet[10]));
+            REQUIRE(jet[14] == approximately(fp_t(.5) * jet[11]));
 
-            REQUIRE(jet[15] == approximately(.5 * jet[6]));
-            REQUIRE(jet[16] == approximately(.5 * jet[7]));
-            REQUIRE(jet[17] == approximately(.5 * jet[8]));
+            REQUIRE(jet[15] == approximately(fp_t(.5) * jet[6]));
+            REQUIRE(jet[16] == approximately(fp_t(.5) * jet[7]));
+            REQUIRE(jet[17] == approximately(fp_t(.5) * jet[8]));
 
             REQUIRE(jet[18] == approximately(1 / fp_t{3} * jet[15]));
             REQUIRE(jet[19] == approximately(1 / fp_t{3} * jet[16]));
@@ -585,13 +585,13 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[10] == approximately(jet[1] + 4));
             REQUIRE(jet[11] == approximately(jet[2] + 4));
 
-            REQUIRE(jet[12] == approximately(.5 * jet[9]));
-            REQUIRE(jet[13] == approximately(.5 * jet[10]));
-            REQUIRE(jet[14] == approximately(.5 * jet[11]));
+            REQUIRE(jet[12] == approximately(fp_t(.5) * jet[9]));
+            REQUIRE(jet[13] == approximately(fp_t(.5) * jet[10]));
+            REQUIRE(jet[14] == approximately(fp_t(.5) * jet[11]));
 
-            REQUIRE(jet[15] == approximately(.5 * jet[6]));
-            REQUIRE(jet[16] == approximately(.5 * jet[7]));
-            REQUIRE(jet[17] == approximately(.5 * jet[8]));
+            REQUIRE(jet[15] == approximately(fp_t(.5) * jet[6]));
+            REQUIRE(jet[16] == approximately(fp_t(.5) * jet[7]));
+            REQUIRE(jet[17] == approximately(fp_t(.5) * jet[8]));
 
             REQUIRE(jet[18] == approximately(1 / fp_t{3} * jet[15]));
             REQUIRE(jet[19] == approximately(1 / fp_t{3} * jet[16]));
@@ -727,8 +727,8 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[1] == 3);
             REQUIRE(jet[2] == approximately(-jet[1] + 2));
             REQUIRE(jet[3] == approximately(-jet[0] - 4));
-            REQUIRE(jet[4] == approximately(-.5 * jet[3]));
-            REQUIRE(jet[5] == approximately(-.5 * jet[2]));
+            REQUIRE(jet[4] == approximately(-fp_t(.5) * jet[3]));
+            REQUIRE(jet[5] == approximately(-fp_t(.5) * jet[2]));
         }
 
         {
@@ -754,10 +754,10 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[5] == approximately(-jet[3] + 2));
             REQUIRE(jet[6] == approximately(-jet[0] - 4));
             REQUIRE(jet[7] == approximately(-jet[1] - 4));
-            REQUIRE(jet[8] == approximately(-.5 * jet[6]));
-            REQUIRE(jet[9] == approximately(-.5 * jet[7]));
-            REQUIRE(jet[10] == approximately(-.5 * jet[4]));
-            REQUIRE(jet[11] == approximately(-.5 * jet[5]));
+            REQUIRE(jet[8] == approximately(-fp_t(.5) * jet[6]));
+            REQUIRE(jet[9] == approximately(-fp_t(.5) * jet[7]));
+            REQUIRE(jet[10] == approximately(-fp_t(.5) * jet[4]));
+            REQUIRE(jet[11] == approximately(-fp_t(.5) * jet[5]));
         }
 
         {
@@ -791,13 +791,13 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[10] == approximately(-jet[1] - 4));
             REQUIRE(jet[11] == approximately(-jet[2] - 4));
 
-            REQUIRE(jet[12] == approximately(-.5 * jet[9]));
-            REQUIRE(jet[13] == approximately(-.5 * jet[10]));
-            REQUIRE(jet[14] == approximately(-.5 * jet[11]));
+            REQUIRE(jet[12] == approximately(-fp_t(.5) * jet[9]));
+            REQUIRE(jet[13] == approximately(-fp_t(.5) * jet[10]));
+            REQUIRE(jet[14] == approximately(-fp_t(.5) * jet[11]));
 
-            REQUIRE(jet[15] == approximately(-.5 * jet[6]));
-            REQUIRE(jet[16] == approximately(-.5 * jet[7]));
-            REQUIRE(jet[17] == approximately(-.5 * jet[8]));
+            REQUIRE(jet[15] == approximately(-fp_t(.5) * jet[6]));
+            REQUIRE(jet[16] == approximately(-fp_t(.5) * jet[7]));
+            REQUIRE(jet[17] == approximately(-fp_t(.5) * jet[8]));
 
             REQUIRE(jet[18] == approximately(-1 / fp_t{3} * jet[15]));
             REQUIRE(jet[19] == approximately(-1 / fp_t{3} * jet[16]));
@@ -841,13 +841,13 @@ TEST_CASE("taylor sub")
             REQUIRE(jet[10] == approximately(-jet[1] - 4));
             REQUIRE(jet[11] == approximately(-jet[2] - 4));
 
-            REQUIRE(jet[12] == approximately(-.5 * jet[9]));
-            REQUIRE(jet[13] == approximately(-.5 * jet[10]));
-            REQUIRE(jet[14] == approximately(-.5 * jet[11]));
+            REQUIRE(jet[12] == approximately(-fp_t(.5) * jet[9]));
+            REQUIRE(jet[13] == approximately(-fp_t(.5) * jet[10]));
+            REQUIRE(jet[14] == approximately(-fp_t(.5) * jet[11]));
 
-            REQUIRE(jet[15] == approximately(-.5 * jet[6]));
-            REQUIRE(jet[16] == approximately(-.5 * jet[7]));
-            REQUIRE(jet[17] == approximately(-.5 * jet[8]));
+            REQUIRE(jet[15] == approximately(-fp_t(.5) * jet[6]));
+            REQUIRE(jet[16] == approximately(-fp_t(.5) * jet[7]));
+            REQUIRE(jet[17] == approximately(-fp_t(.5) * jet[8]));
 
             REQUIRE(jet[18] == approximately(-1 / fp_t{3} * jet[15]));
             REQUIRE(jet[19] == approximately(-1 / fp_t{3} * jet[16]));
diff --git a/test/taylor_sum.cpp b/test/taylor_sum.cpp
index 8f04ee554..ea8533078 100644
--- a/test/taylor_sum.cpp
+++ b/test/taylor_sum.cpp
@@ -33,7 +33,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -163,8 +163,8 @@ TEST_CASE("taylor sum")
             REQUIRE(jet[1] == 3);
             REQUIRE(jet[2] == approximately(fp_t{2 + 2 + 2 + 3}));
             REQUIRE(jet[3] == approximately(fp_t{5}));
-            REQUIRE(jet[4] == approximately(.5 * (jet[2] + jet[3])));
-            REQUIRE(jet[5] == approximately(.5 * (jet[2] + jet[3])));
+            REQUIRE(jet[4] == approximately(fp_t(.5) * (jet[2] + jet[3])));
+            REQUIRE(jet[5] == approximately(fp_t(.5) * (jet[2] + jet[3])));
         }
 
         {
@@ -195,11 +195,11 @@ TEST_CASE("taylor sum")
             REQUIRE(jet[6] == 5);
             REQUIRE(jet[7] == -5);
 
-            REQUIRE(jet[8] == approximately(.5 * (jet[4] + jet[6])));
-            REQUIRE(jet[9] == approximately(.5 * (jet[5] + jet[7])));
+            REQUIRE(jet[8] == approximately(fp_t(.5) * (jet[4] + jet[6])));
+            REQUIRE(jet[9] == approximately(fp_t(.5) * (jet[5] + jet[7])));
 
-            REQUIRE(jet[10] == approximately(.5 * (jet[4] + jet[6])));
-            REQUIRE(jet[11] == approximately(.5 * (jet[5] + jet[7])));
+            REQUIRE(jet[10] == approximately(fp_t(.5) * (jet[4] + jet[6])));
+            REQUIRE(jet[11] == approximately(fp_t(.5) * (jet[5] + jet[7])));
         }
 
         {
@@ -234,13 +234,13 @@ TEST_CASE("taylor sum")
             REQUIRE(jet[10] == -5);
             REQUIRE(jet[11] == 3);
 
-            REQUIRE(jet[12] == approximately(.5 * (jet[6] + jet[9])));
-            REQUIRE(jet[13] == approximately(.5 * (jet[7] + jet[10])));
-            REQUIRE(jet[14] == approximately(.5 * (jet[8] + jet[11])));
+            REQUIRE(jet[12] == approximately(fp_t(.5) * (jet[6] + jet[9])));
+            REQUIRE(jet[13] == approximately(fp_t(.5) * (jet[7] + jet[10])));
+            REQUIRE(jet[14] == approximately(fp_t(.5) * (jet[8] + jet[11])));
 
-            REQUIRE(jet[15] == approximately(.5 * (jet[6] + jet[9])));
-            REQUIRE(jet[16] == approximately(.5 * (jet[7] + jet[10])));
-            REQUIRE(jet[17] == approximately(.5 * (jet[8] + jet[11])));
+            REQUIRE(jet[15] == approximately(fp_t(.5) * (jet[6] + jet[9])));
+            REQUIRE(jet[16] == approximately(fp_t(.5) * (jet[7] + jet[10])));
+            REQUIRE(jet[17] == approximately(fp_t(.5) * (jet[8] + jet[11])));
 
             REQUIRE(jet[18] == approximately((jet[12] + jet[15]) / 3));
             REQUIRE(jet[19] == approximately((jet[13] + jet[16]) / 3));
diff --git a/test/taylor_sum_sq.cpp b/test/taylor_sum_sq.cpp
index 18711c8d2..0ecc3d7fb 100644
--- a/test/taylor_sum_sq.cpp
+++ b/test/taylor_sum_sq.cpp
@@ -49,7 +49,7 @@ auto sum_sq(const std::vector<expression> &args)
     return sum(new_args);
 }
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_t_event.cpp b/test/taylor_t_event.cpp
index 13cdbd3eb..4b7046365 100644
--- a/test/taylor_t_event.cpp
+++ b/test/taylor_t_event.cpp
@@ -41,7 +41,7 @@
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
@@ -166,7 +166,7 @@ TEST_CASE("taylor te")
 
             oss << ev_t(
                 v * v - 1e-10, kw::direction = event_direction::negative,
-                kw::callback = [](auto &, bool, int, auto...) { return true; }, kw::cooldown = -5);
+                kw::callback = [](auto &, bool, int, auto...) { return true; }, kw::cooldown = fp_t(-5));
             REQUIRE(boost::algorithm::contains(oss.str(), " event_direction::negative"));
             REQUIRE(boost::algorithm::contains(oss.str(), " terminal"));
             REQUIRE(boost::algorithm::contains(oss.str(), " auto"));
@@ -175,7 +175,7 @@ TEST_CASE("taylor te")
 
             oss << ev_t(
                 v * v - 1e-10, kw::direction = event_direction::negative,
-                kw::callback = [](auto &, bool, int, auto...) { return true; }, kw::cooldown = 1);
+                kw::callback = [](auto &, bool, int, auto...) { return true; }, kw::cooldown = fp_t(1));
             REQUIRE(boost::algorithm::contains(oss.str(), " event_direction::negative"));
             REQUIRE(boost::algorithm::contains(oss.str(), " terminal"));
             REQUIRE(boost::algorithm::contains(oss.str(), " 1"));
@@ -235,6 +235,9 @@ TEST_CASE("taylor te basic")
         using t_ev_t = typename taylor_adaptive<fp_t>::t_event_t;
         using nt_ev_t = typename taylor_adaptive<fp_t>::nt_event_t;
 
+        // NOTE: don't make the small delta too small in single-precision.
+        const auto small_delta = std::is_same_v<fp_t, float> ? 1e-6 : 1e-10;
+
         // NOTE: test also sub-eps tolerance.
         for (auto cur_tol : {std::numeric_limits<fp_t>::epsilon(), std::numeric_limits<fp_t>::epsilon() / 100}) {
             auto counter_nt = 0u, counter_t = 0u;
@@ -248,24 +251,25 @@ TEST_CASE("taylor te basic")
                 kw::opt_level = opt_level,
                 kw::high_accuracy = high_accuracy,
                 kw::compact_mode = compact_mode,
-                kw::nt_events = {nt_ev_t(v * v - 1e-10,
-                                         [&counter_nt, &cur_time, &direction](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
-                                             // Make sure the callbacks are called in order.
-                                             if (direction) {
-                                                 REQUIRE(t > cur_time);
-                                             } else {
-                                                 REQUIRE(t < cur_time);
-                                             }
+                kw::nt_events
+                = {nt_ev_t(v * v - small_delta,
+                           [&counter_nt, &cur_time, &direction, small_delta](taylor_adaptive<fp_t> &ta_, fp_t t, int) {
+                               // Make sure the callbacks are called in order.
+                               if (direction) {
+                                   REQUIRE(t > cur_time);
+                               } else {
+                                   REQUIRE(t < cur_time);
+                               }
 
-                                             ta_.update_d_output(t);
+                               ta_.update_d_output(t);
 
-                                             const auto vel = ta_.get_d_output()[1];
-                                             REQUIRE(abs(vel * vel - 1e-10) < std::numeric_limits<fp_t>::epsilon());
+                               const auto vel = ta_.get_d_output()[1];
+                               REQUIRE(abs(vel * vel - small_delta) < std::numeric_limits<fp_t>::epsilon());
 
-                                             ++counter_nt;
+                               ++counter_nt;
 
-                                             cur_time = t;
-                                         })},
+                               cur_time = t;
+                           })},
                 kw::t_events = {t_ev_t(
                     v, kw::callback = [&counter_t, &cur_time, &direction](taylor_adaptive<fp_t> &ta_, bool mr, int) {
                         const auto t = ta_.get_time();
@@ -994,7 +998,7 @@ TEST_CASE("taylor te boolean callback")
         // Some testing for propagate_grid() too.
         ta.reset_cooldowns();
         ta.set_time(fp_t{0});
-        ta.get_state_data()[0] = -0.1;
+        ta.get_state_data()[0] = fp_t(-0.1);
         ta.get_state_data()[1] = 0;
         cur_time = -1;
         direction = true;
@@ -1092,6 +1096,7 @@ struct s11n_callback {
     }
 };
 
+HEYOKA_S11N_CALLABLE_EXPORT(s11n_callback, bool, taylor_adaptive<float> &, bool, int)
 HEYOKA_S11N_CALLABLE_EXPORT(s11n_callback, bool, taylor_adaptive<double> &, bool, int)
 HEYOKA_S11N_CALLABLE_EXPORT(s11n_callback, bool, taylor_adaptive<long double> &, bool, int)
 
diff --git a/test/taylor_tan.cpp b/test/taylor_tan.cpp
index 98f33d434..a443dae39 100644
--- a/test/taylor_tan.cpp
+++ b/test/taylor_tan.cpp
@@ -39,7 +39,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_tanh.cpp b/test/taylor_tanh.cpp
index e5ae0ec3f..363c67c33 100644
--- a/test/taylor_tanh.cpp
+++ b/test/taylor_tanh.cpp
@@ -39,7 +39,7 @@ static std::mt19937 rng;
 using namespace heyoka;
 using namespace heyoka_test;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/test/taylor_time.cpp b/test/taylor_time.cpp
index 421dc4826..6a0eb023e 100644
--- a/test/taylor_time.cpp
+++ b/test/taylor_time.cpp
@@ -39,7 +39,7 @@ using namespace heyoka;
 using namespace heyoka_test;
 namespace hy = heyoka;
 
-const auto fp_types = std::tuple<double
+const auto fp_types = std::tuple<float, double
 #if !defined(HEYOKA_ARCH_PPC)
                                  ,
                                  long double
diff --git a/tools/gha_conda_clang_tidy.sh b/tools/gha_conda_clang_tidy.sh
index f56387f57..5c6f68ffd 100755
--- a/tools/gha_conda_clang_tidy.sh
+++ b/tools/gha_conda_clang_tidy.sh
@@ -16,7 +16,7 @@ export PATH="$HOME/miniconda/bin:$PATH"
 bash miniconda.sh -b -p $HOME/miniconda
 conda config --add channels conda-forge
 conda config --set channel_priority strict
-conda create -y -q -p $deps_dir cmake c-compiler cxx-compiler clang clangxx clang-tools llvmdev tbb-devel tbb boost-cpp 'mppp>=0.27' sleef 'fmt=9.*' spdlog ninja
+conda create -y -q -p $deps_dir cmake c-compiler cxx-compiler clang clangxx clang-tools llvmdev tbb-devel tbb boost-cpp 'mppp>=0.27' sleef fmt spdlog ninja
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_conda_coverage.sh b/tools/gha_conda_coverage.sh
index 235149d08..34c7f4d0a 100644
--- a/tools/gha_conda_coverage.sh
+++ b/tools/gha_conda_coverage.sh
@@ -16,7 +16,7 @@ export PATH="$HOME/miniconda/bin:$PATH"
 bash miniconda.sh -b -p $HOME/miniconda
 conda config --add channels conda-forge
 conda config --set channel_priority strict
-conda create -y -q -p $deps_dir c-compiler cxx-compiler cmake llvmdev tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog lcov
+conda create -y -q -p $deps_dir c-compiler cxx-compiler cmake llvmdev tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog lcov
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_conda_release.sh b/tools/gha_conda_release.sh
index 28c0afbbc..9c6426c57 100644
--- a/tools/gha_conda_release.sh
+++ b/tools/gha_conda_release.sh
@@ -16,7 +16,7 @@ export PATH="$HOME/miniconda/bin:$PATH"
 bash miniconda.sh -b -p $HOME/miniconda
 conda config --add channels conda-forge
 conda config --set channel_priority strict
-conda create -y -q -p $deps_dir c-compiler cxx-compiler cmake llvmdev tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog
+conda create -y -q -p $deps_dir c-compiler cxx-compiler cmake llvmdev tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_llvm11_conda_asan.sh b/tools/gha_llvm11_conda_asan.sh
index effacca72..d8be27f7c 100644
--- a/tools/gha_llvm11_conda_asan.sh
+++ b/tools/gha_llvm11_conda_asan.sh
@@ -14,7 +14,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforg
 export deps_dir=$HOME/local
 export PATH="$HOME/mambaforge/bin:$PATH"
 bash mambaforge.sh -b -p $HOME/mambaforge
-mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=11.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog
+mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=11.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_llvm12_conda_asan.sh b/tools/gha_llvm12_conda_asan.sh
index 5d0083702..c02674c23 100644
--- a/tools/gha_llvm12_conda_asan.sh
+++ b/tools/gha_llvm12_conda_asan.sh
@@ -14,7 +14,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforg
 export deps_dir=$HOME/local
 export PATH="$HOME/mambaforge/bin:$PATH"
 bash mambaforge.sh -b -p $HOME/mambaforge
-mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=12.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog
+mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=12.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_llvm13_conda_asan.sh b/tools/gha_llvm13_conda_asan.sh
index a953b9e86..6f6a4ad60 100644
--- a/tools/gha_llvm13_conda_asan.sh
+++ b/tools/gha_llvm13_conda_asan.sh
@@ -14,7 +14,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforg
 export deps_dir=$HOME/local
 export PATH="$HOME/mambaforge/bin:$PATH"
 bash mambaforge.sh -b -p $HOME/mambaforge
-mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=13.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog
+mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=13.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_llvm14_conda_asan.sh b/tools/gha_llvm14_conda_asan.sh
index b2e0c69b4..0200f9c99 100644
--- a/tools/gha_llvm14_conda_asan.sh
+++ b/tools/gha_llvm14_conda_asan.sh
@@ -14,7 +14,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforg
 export deps_dir=$HOME/local
 export PATH="$HOME/mambaforge/bin:$PATH"
 bash mambaforge.sh -b -p $HOME/mambaforge
-mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=14.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog
+mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=14.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_llvm15_conda_asan.sh b/tools/gha_llvm15_conda_asan.sh
index 609c121db..01f866f56 100644
--- a/tools/gha_llvm15_conda_asan.sh
+++ b/tools/gha_llvm15_conda_asan.sh
@@ -14,7 +14,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforg
 export deps_dir=$HOME/local
 export PATH="$HOME/mambaforge/bin:$PATH"
 bash mambaforge.sh -b -p $HOME/mambaforge
-mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=15.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog
+mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=15.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_llvm16_conda_asan.sh b/tools/gha_llvm16_conda_asan.sh
index 24c930669..840ceedcd 100755
--- a/tools/gha_llvm16_conda_asan.sh
+++ b/tools/gha_llvm16_conda_asan.sh
@@ -14,7 +14,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforg
 export deps_dir=$HOME/local
 export PATH="$HOME/mambaforge/bin:$PATH"
 bash mambaforge.sh -b -p $HOME/mambaforge
-mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=16.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog
+mamba create -y -q -p $deps_dir c-compiler cxx-compiler cmake 'llvmdev=16.*' tbb-devel tbb boost-cpp 'mppp>=0.27' sleef xtensor xtensor-blas blas blas-devel fmt spdlog
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/gha_osx.sh b/tools/gha_osx.sh
index 9ebdd9268..12812566d 100644
--- a/tools/gha_osx.sh
+++ b/tools/gha_osx.sh
@@ -13,7 +13,7 @@ export PATH="$HOME/miniconda/bin:$PATH"
 bash miniconda.sh -b -p $HOME/miniconda
 conda config --add channels conda-forge
 conda config --set channel_priority strict
-conda create -y -q -p $deps_dir c-compiler cxx-compiler libcxx cmake llvmdev tbb-devel tbb boost-cpp sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog 'mppp>=0.27'
+conda create -y -q -p $deps_dir c-compiler cxx-compiler libcxx cmake llvmdev tbb-devel tbb boost-cpp sleef xtensor xtensor-blas blas blas-devel fmt spdlog 'mppp>=0.27'
 source activate $deps_dir
 
 # Create the build dir and cd into it.
diff --git a/tools/travis_ubuntu_ppc64.sh b/tools/travis_ubuntu_ppc64.sh
index 9f8e6a714..ff6216989 100755
--- a/tools/travis_ubuntu_ppc64.sh
+++ b/tools/travis_ubuntu_ppc64.sh
@@ -11,7 +11,7 @@ curl -L -o miniconda.sh https://github.com/conda-forge/miniforge/releases/latest
 export deps_dir=$HOME/local
 export PATH="$HOME/miniconda/bin:$PATH"
 bash miniconda.sh -b -p $HOME/miniconda
-conda create -y -q -p $deps_dir cxx-compiler c-compiler cmake llvmdev tbb-devel tbb boost-cpp sleef xtensor xtensor-blas blas blas-devel 'fmt=9.*' spdlog make mppp
+conda create -y -q -p $deps_dir cxx-compiler c-compiler cmake llvmdev tbb-devel tbb boost-cpp sleef xtensor xtensor-blas blas blas-devel fmt spdlog make mppp
 source activate $deps_dir
 
 # Create the build dir and cd into it.