From acc61b564c084c0244f05e99fe3c9e97f2f1c473 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 26 Nov 2024 14:48:29 +0000
Subject: [PATCH] Docs: Improved benchmarks table

---
 README.md         | 141 ++++++++++++++++++++++++++++++++++++++++++----
 scripts/bench.cxx |   5 --
 2 files changed, 130 insertions(+), 16 deletions(-)
diff --git a/README.md b/README.md
index 21426609..229b39ce 100644
--- a/README.md
+++ b/README.md
@@ -92,17 +92,136 @@ You can learn more about the technical implementation details in the following b
 
 ## Benchmarks
 
-For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
-Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements for the two most common vector-vector similarity metrics - the Cosine similarity and the Euclidean distance:
-
-| Type       |                  Apple M2 Pro |            Intel Sapphire Rapids |                  AWS Graviton 4 |
-| :--------- | ----------------------------: | -------------------------------: | ------------------------------: |
-| `float64`  | 18.5 → 28.8 GB/s <br/> + 56 % |    21.9 → 41.4 GB/s <br/> + 89 % |   20.7 → 41.3 GB/s <br/> + 99 % |
-| `float32`  | 9.2 → 29.6 GB/s <br/> + 221 % |   10.9 → 95.8 GB/s <br/> + 779 % |   4.9 → 41.9 GB/s <br/> + 755 % |
-| `float16`  | 4.6 → 14.6 GB/s <br/> + 217 % | 3.1 → 108.4 GB/s <br/> + 3,397 % |   5.4 → 39.3 GB/s <br/> + 627 % |
-| `bfloat16` | 4.6 → 26.3 GB/s <br/> + 472 % |   0.8 → 59.5 GB/s <br/> +7,437 % | 2.5 → 29.9 GB/s <br/> + 1,096 % |
-| `int8`     | 25.8 → 47.1 GB/s <br/> + 83 % |    33.1 → 65.3 GB/s <br/> + 97 % |   35.2 → 43.5 GB/s <br/> + 24 % |
-| `uint8`    |                               |   32.5 → 66.5 GB/s <br/> + 105 % |                                 |
+<table style="width: 100%; text-align: center; table-layout: fixed;">
+  <colgroup>
+    <col style="width: 33%;">
+    <col style="width: 33%;">
+    <col style="width: 33%;">
+  </colgroup>
+  <tr>
+    <th align="center">NumPy</th>
+    <th align="center">C 99</th>
+    <th align="center">SimSIMD</th>
+  </tr>
+  <!-- Cosine distance with different precision levels -->
+  <tr>
+    <td colspan="3" align="center">cosine distances between 1536d vectors in <code>float16</code></td>
+  </tr>
+  <tr>
+    <td align="center"> <!-- scipy.spatial.distance.cosine -->
+      <code>int8</code><br/>
+      🚧 overflows<br/>
+      <code>bfloat16</code><br/>
+      🚧 not supported<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>40,481</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>21,451</b> ops/s
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>253,902</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>46,394</b> ops/s
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>212,421</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>52,904</b> ops/s
+    </td>
+    <td align="center"> <!-- serial -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>10,548,600</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>11,379,300</b> ops/s
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>119,835</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>403,909</b> ops/s
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>501,310</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>871,963</b> ops/s
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>882,484</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>399,661</b> ops/s
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>839,301</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>837,126</b> ops/s
+    </td>
+    <td align="center"> <!-- simsimd -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>16,151,800</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>13,524,000</b> ops/s
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>9,738,540</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,881,900</b> ops/s
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>7,627,600</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,316,810</b> ops/s
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>8,202,910</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,400,620</b> ops/s
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,538,530</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,678,920</b> ops/s
+    </td>
+  </tr>
+  <!-- Euclidean distance with different precision level -->
+  <tr>
+    <td colspan="3" align="center">Euclidean distance between 1536d vectors in <code>float16</code></td>
+  </tr>
+  <tr>
+    <td align="center"> <!-- scipy.spatial.distance.sqeuclidean -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>252,113</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>177,443</b> ops/s
+      <code>bfloat16</code><br/>
+      🚧 not supported<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>54,621</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>71,793</b> ops/s
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>424,944</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>292,629</b> ops/s
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>334,929</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>237,505</b> ops/s
+    </td>
+    <td align="center"> <!-- serial -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>6,690,110</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,114,160</b> ops/s
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>119,842</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,049,230</b> ops/s
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>196,413</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>911,370</b> ops/s
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,295,210</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,055,940</b> ops/s
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,215,190</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>905,782</b> ops/s
+    </td>
+    <td align="center"> <!-- simsimd -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>18,989,000</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>18,878,200</b> ops/s
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>9,727,210</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,233,420</b> ops/s
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>19,466,800</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,522,760</b> ops/s
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>8,924,100</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,602,650</b> ops/s
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,701,740</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,735,840</b> ops/s
+    </td>
+  </tr>
+  <!-- Bilinear forms -->
+  <!-- Sparse set intersections -->
+</table>
+
+> The code was compiled with GCC 12, using glibc v2.35.
+> The benchmarks were performed on Arm-based Graviton3 AWS `c7g` instances and `r7iz` Intel Sapphire Rapids.
+> Most modern Arm-based 64-bit CPUs will have similar relative speedups.
+> Variance within x86 CPUs will be larger.
 
 Similar speedups are often observed even when compared to BLAS and LAPACK libraries underlying most numerical computing libraries, including NumPy and SciPy in Python.
 Broader benchmarking results:
diff --git a/scripts/bench.cxx b/scripts/bench.cxx
index 76adbc2e..ca5ce0a3 100644
--- a/scripts/bench.cxx
+++ b/scripts/bench.cxx
@@ -865,11 +865,6 @@ int main(int argc, char **argv) {
 #endif
 
 #if SIMSIMD_TARGET_SVE
-    dense_<f16_k>("dot_f16_sve", simsimd_dot_f16_sve, simsimd_dot_f16_accurate);
-    dense_<f16_k>("cos_f16_sve", simsimd_cos_f16_sve, simsimd_cos_f16_accurate);
-    dense_<f16_k>("l2sq_f16_sve", simsimd_l2sq_f16_sve, simsimd_l2sq_f16_accurate);
-    dense_<f16_k>("l2_f16_sve", simsimd_l2_f16_sve, simsimd_l2_f16_accurate);
-
     dense_<f32_k>("dot_f32_sve", simsimd_dot_f32_sve, simsimd_dot_f32_accurate);
     dense_<f32_k>("cos_f32_sve", simsimd_cos_f32_sve, simsimd_cos_f32_accurate);
     dense_<f32_k>("l2sq_f32_sve", simsimd_l2sq_f32_sve, simsimd_l2sq_f32_accurate);

NumPy	C 99	SimSIMD
cosine distances between 1536d vectors in `float16`
+ `int8` + 🚧 overflows + `bfloat16` + 🚧 not supported + `float16` + x86: 40,481 · + arm: 21,451 ops/s + `float32` + x86: 253,902 · + arm: 46,394 ops/s + `float64` + x86: 212,421 · + arm: 52,904 ops/s +	+ `int8` + x86: 10,548,600 · + arm: 11,379,300 ops/s + `bfloat16` + x86: 119,835 · + arm: 403,909 ops/s + `float16` + x86: 501,310 · + arm: 871,963 ops/s + `float32` + x86: 882,484 · + arm: 399,661 ops/s + `float64` + x86: 839,301 · + arm: 837,126 ops/s +	+ `int8` + x86: 16,151,800 · + arm: 13,524,000 ops/s + `bfloat16` + x86: 9,738,540 · + arm: 4,881,900 ops/s + `float16` + x86: 7,627,600 · + arm: 3,316,810 ops/s + `float32` + x86: 8,202,910 · + arm: 3,400,620 ops/s + `float64` + x86: 1,538,530 · + arm: 1,678,920 ops/s +
Euclidean distance between 1536d vectors in `float16`
+ `int8` + x86: 252,113 · + arm: 177,443 ops/s + `bfloat16` + 🚧 not supported + `float16` + x86: 54,621 · + arm: 71,793 ops/s + `float32` + x86: 424,944 · + arm: 292,629 ops/s + `float64` + x86: 334,929 · + arm: 237,505 ops/s +	+ `int8` + x86: 6,690,110 · + arm: 4,114,160 ops/s + `bfloat16` + x86: 119,842 · + arm: 1,049,230 ops/s + `float16` + x86: 196,413 · + arm: 911,370 ops/s + `float32` + x86: 1,295,210 · + arm: 1,055,940 ops/s + `float64` + x86: 1,215,190 · + arm: 905,782 ops/s +	+ `int8` + x86: 18,989,000 · + arm: 18,878,200 ops/s + `bfloat16` + x86: 9,727,210 · + arm: 4,233,420 ops/s + `float16` + x86: 19,466,800 · + arm: 3,522,760 ops/s + `float32` + x86: 8,924,100 · + arm: 3,602,650 ops/s + `float64` + x86: 1,701,740 · + arm: 1,735,840 ops/s +