Faster integer division

shibatch · Sep 6, 2024 · db5bb9a
1 parent 3423ad7
commit db5bb9a
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 17 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -229,7 +229,7 @@ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT WIN32)
   elseif(COMMAND_GOLD)
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
   endif()
-elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND WIN32)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /clang:-fconstexpr-steps=1000000000")
   set(INLINE_CXX_FLAGS "/clang:-mllvm;/clang:-inline-threshold=100000")
 elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")

diff --git a/src/include/tlfloat/bigint.hpp b/src/include/tlfloat/bigint.hpp
@@ -32,7 +32,7 @@
 /*! \endcond */
 
 #ifndef TLFLOAT_DISABLE_ARCH_OPTIMIZATION
-#if defined(__GNUC__) || defined(__clang__)
+#if defined(__GNUC__) || (defined(__clang__) && !defined(_MSC_VER))
 
 #if defined(__x86_64__) && !defined(__CUDA_ARCH__)
 #include <x86intrin.h>
@@ -57,7 +57,7 @@
 
 #undef TLFLOAT_NOINLINE
 #define TLFLOAT_NOINLINE __attribute__((noinline))
-#endif // #if defined(__GNUC__) || defined(__clang__)
+#endif // #if defined(__GNUC__) || (defined(__clang__) && !defined(_MSC_VER))
 
 #ifdef _MSC_VER
 #if !defined(__CUDA_ARCH__)
@@ -72,8 +72,12 @@
 
 #ifdef TLFLOAT_ENABLE_INLINING
 #undef TLFLOAT_INLINE
+#if !defined(__clang__)
 #define TLFLOAT_INLINE __forceinline
+#else
+#define TLFLOAT_INLINE __attribute__((always_inline))
 #endif
+#endif // #ifdef TLFLOAT_ENABLE_INLINING
 #endif // #if !defined(__CUDA_ARCH__)
 #endif // #ifdef _MSC_VER
 #endif // #ifndef TLFLOAT_DISABLE_ARCH_OPTIMIZATION
@@ -687,17 +691,11 @@ namespace tlfloat {
     }
 
     constexpr TLFLOAT_INLINE BigUInt operator/(const BigUInt& rhs) const {
-      if (rhs == 1) return *this;
-      BigUInt q = this->mulhi(rhs.reciprocal());
-      if (!(rhs > *this - q * rhs)) q++;
-      return q;
+      return div(*this, rhs).first;
     }
 
     constexpr TLFLOAT_INLINE BigUInt operator%(const BigUInt& rhs) const {
-      BigUInt q = this->mulhi(rhs.reciprocal());
-      BigUInt m = *this - q * rhs;
-      if (!(rhs > m)) m -= rhs;
-      return m;
+      return div(*this, rhs).second;
     }
 
     /** This method returns ((1 << N) / *this) */
@@ -888,6 +886,11 @@ namespace tlfloat {
       return xpair<BigUInt, BigUInt>(q, m);
     }
 
+    /** This method returns the quotient (first) and the remainder (second) of *this divided by rhs in a single call. */
+    constexpr TLFLOAT_INLINE xpair<BigUInt, BigUInt> divmod(const BigUInt& rhs) const {
+      return div(*this, rhs);
+    }
+
     /** This method finds the quotient and remainder of (*this << ((1
 	<< N)-1)) divided by (rhs | (1 << ((1 << N)-1))) at a time. Give
 	rhs.reciprocal2() as the second argument. */
@@ -1184,6 +1187,10 @@ namespace tlfloat {
     constexpr TLFLOAT_INLINE xpair<BigUInt, BigUInt> divmod2(const BigUInt& rhs) const {
       return div(BigUInt<7>(*this) << ((1 << 6)-1), rhs | (1ULL << ((1 << 6)-1)));
     }
+
+    constexpr TLFLOAT_INLINE xpair<BigUInt, BigUInt> divmod(const BigUInt& rhs) const {
+      return xpair<BigUInt, BigUInt> { *this / rhs, *this % rhs };
+    }
   };
 
   /**

diff --git a/src/tester/test_bigint2.cpp b/src/tester/test_bigint2.cpp
@@ -25,7 +25,7 @@ static_assert(is_trivially_copyable_v<BigInt<9>> == true);
 static_assert(is_trivially_copyable_v<BigInt<10>> == true);
 
 template<int N>
-xpair<BigUInt<N+1>, BigUInt<N+1>> xdivmod(BigUInt<N> n, BigUInt<N> d) {
+xpair<BigUInt<N+1>, BigUInt<N+1>> xdivmod2(BigUInt<N> n, BigUInt<N> d) {
   BigUInt<N+1> xn = BigUInt<N+1>(n) << ((1 << N) - 1);
   BigUInt<N+1> xd = d | (BigUInt<N+1>(1) << ((1 << N)-1));
   return xpair<BigUInt<N+1>, BigUInt<N+1>>(BigUInt<N+1>(xn / xd), BigUInt<N+1>(xn % xd));
@@ -53,7 +53,7 @@ void doTestRec2(BigUInt<N> d) {
 
 template<int N>
 void doTestDivmod2(BigUInt<N> n, BigUInt<N> d) {
-  auto c = xdivmod(n, d);
+  auto c = xdivmod2(n, d);
 
   auto t = n.divmod2(d, d.reciprocal2());
   if (c.first != t.first || c.second != t.second) {
@@ -79,6 +79,18 @@ void doTestDivmod2(BigUInt<N> n, BigUInt<N> d) {
     cout << "c.r = " << toHexString(c.second) << endl;
     exit(-1);
   }
+
+  if (d != 0) {
+    t = n.divmod(d);
+    if (t.second >= d || t.first * d + t.second != n) {
+      cout << "N    = " << N << endl;
+      cout << "n    = " << toHexString(n) << " " << n << endl;
+      cout << "d    = " << toHexString(d) << " " << d << endl;
+      cout << "t.q  = " << toHexString(t.first ) << endl;
+      cout << "t.r  = " << toHexString(t.second) << endl;
+      exit(-1);
+    }
+  }
 }
 
 template<int N>

diff --git a/winbuild-clang.bat b/winbuild-clang.bat
@@ -1,5 +1,6 @@
 @echo off
 set CLANGINSTALLDIR=%VCINSTALLDIR%Tools\Llvm\x64
+set INSTALLDIR=tlfloat_install
 
 if NOT exist winbuild-clang.bat exit /b 255
 
@@ -14,8 +15,6 @@ echo Edit this batch file to set CLANGINSTALLDIR correctly.
 exit /b 255
 )
 
-set INSTALLDIR=tlfloat_install
-
 if %VSCMD_ARG_HOST_ARCH%==x86 call "%VCINSTALLDIR%Auxiliary\Build\vcvars64.bat"
 
 if exist build\ rmdir /S /Q build

diff --git a/winbuild-msvc.bat b/winbuild-msvc.bat
@@ -1,13 +1,13 @@
 @echo off
+set INSTALLDIR=tlfloat_install
+
 if NOT exist winbuild-msvc.bat exit /b 255
 
 if "%VSCMD_ARG_HOST_ARCH%"=="" (
 echo Run this batch file from Developer Command Prompt for VS 20XX
 exit /b 255
 )
 
-set INSTALLDIR=tlfloat_install
-
 if %VSCMD_ARG_HOST_ARCH%==x86 call "%VCINSTALLDIR%Auxiliary\Build\vcvars64.bat"
 
 if exist build\ rmdir /S /Q build