From acc61b564c084c0244f05e99fe3c9e97f2f1c473 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:48:29 +0000 Subject: [PATCH] Docs: Improved benchmarks table --- README.md | 141 ++++++++++++++++++++++++++++++++++++++++++---- scripts/bench.cxx | 5 -- 2 files changed, 130 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 21426609..229b39ce 100644 --- a/README.md +++ b/README.md @@ -92,17 +92,136 @@ You can learn more about the technical implementation details in the following b ## Benchmarks -For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API. -Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements for the two most common vector-vector similarity metrics - the Cosine similarity and the Euclidean distance: - -| Type | Apple M2 Pro | Intel Sapphire Rapids | AWS Graviton 4 | -| :--------- | ----------------------------: | -------------------------------: | ------------------------------: | -| `float64` | 18.5 → 28.8 GB/s
+ 56 % | 21.9 → 41.4 GB/s
+ 89 % | 20.7 → 41.3 GB/s
+ 99 % | -| `float32` | 9.2 → 29.6 GB/s
+ 221 % | 10.9 → 95.8 GB/s
+ 779 % | 4.9 → 41.9 GB/s
+ 755 % | -| `float16` | 4.6 → 14.6 GB/s
+ 217 % | 3.1 → 108.4 GB/s
+ 3,397 % | 5.4 → 39.3 GB/s
+ 627 % | -| `bfloat16` | 4.6 → 26.3 GB/s
+ 472 % | 0.8 → 59.5 GB/s
+7,437 % | 2.5 → 29.9 GB/s
+ 1,096 % | -| `int8` | 25.8 → 47.1 GB/s
+ 83 % | 33.1 → 65.3 GB/s
+ 97 % | 35.2 → 43.5 GB/s
+ 24 % | -| `uint8` | | 32.5 → 66.5 GB/s
+ 105 % | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NumPyC 99SimSIMD
cosine distances between 1536d vectors in float16
+ int8
+ 🚧 overflows
+ bfloat16
+ 🚧 not supported
+ float16
+ x86: 40,481 · + arm: 21,451 ops/s + float32
+ x86: 253,902 · + arm: 46,394 ops/s + float64
+ x86: 212,421 · + arm: 52,904 ops/s +
+ int8
+ x86: 10,548,600 · + arm: 11,379,300 ops/s + bfloat16
+ x86: 119,835 · + arm: 403,909 ops/s + float16
+ x86: 501,310 · + arm: 871,963 ops/s + float32
+ x86: 882,484 · + arm: 399,661 ops/s + float64
+ x86: 839,301 · + arm: 837,126 ops/s +
+ int8
+ x86: 16,151,800 · + arm: 13,524,000 ops/s + bfloat16
+ x86: 9,738,540 · + arm: 4,881,900 ops/s + float16
+ x86: 7,627,600 · + arm: 3,316,810 ops/s + float32
+ x86: 8,202,910 · + arm: 3,400,620 ops/s + float64
+ x86: 1,538,530 · + arm: 1,678,920 ops/s +
euclidean distance between 1536d vectors in float16
+ int8
+ x86: 252,113 · + arm: 177,443 ops/s + bfloat16
+ 🚧 not supported
+ float16
+ x86: 54,621 · + arm: 71,793 ops/s + float32
+ x86: 424,944 · + arm: 292,629 ops/s + float64
+ x86: 334,929 · + arm: 237,505 ops/s +
+ int8
+ x86: 6,690,110 · + arm: 4,114,160 ops/s + bfloat16
+ x86: 119,842 · + arm: 1,049,230 ops/s + float16
+ x86: 196,413 · + arm: 911,370 ops/s + float32
+ x86: 1,295,210 · + arm: 1,055,940 ops/s + float64
+ x86: 1,215,190 · + arm: 905,782 ops/s +
+ int8
+ x86: 18,989,000 · + arm: 18,878,200 ops/s + bfloat16
+ x86: 9,727,210 · + arm: 4,233,420 ops/s + float16
+ x86: 19,466,800 · + arm: 3,522,760 ops/s + float32
+ x86: 8,924,100 · + arm: 3,602,650 ops/s + float64
+ x86: 1,701,740 · + arm: 1,735,840 ops/s +
+ 

> The code was compiled with GCC 12, using glibc v2.35.
> The benchmarks were performed on Arm-based Graviton3 AWS `c7g` instances and `r7iz` Intel Sapphire Rapids.
> Most modern Arm-based 64-bit CPUs will have similar relative speedups.
> Variance within x86 CPUs will be larger. Similar speedups are often observed even when compared to BLAS and LAPACK libraries underlying most numerical computing libraries, including NumPy and SciPy in Python. Broader benchmarking results: diff --git a/scripts/bench.cxx b/scripts/bench.cxx index 76adbc2e..ca5ce0a3 100644 --- a/scripts/bench.cxx +++ b/scripts/bench.cxx @@ -865,11 +865,6 @@ int main(int argc, char **argv) { #endif #if SIMSIMD_TARGET_SVE - dense_("dot_f16_sve", simsimd_dot_f16_sve, simsimd_dot_f16_accurate); - dense_("cos_f16_sve", simsimd_cos_f16_sve, simsimd_cos_f16_accurate); - dense_("l2sq_f16_sve", simsimd_l2sq_f16_sve, simsimd_l2sq_f16_accurate); - dense_("l2_f16_sve", simsimd_l2_f16_sve, simsimd_l2_f16_accurate); - dense_("dot_f32_sve", simsimd_dot_f32_sve, simsimd_dot_f32_accurate); dense_("cos_f32_sve", simsimd_cos_f32_sve, simsimd_cos_f32_accurate); dense_("l2sq_f32_sve", simsimd_l2sq_f32_sve, simsimd_l2sq_f32_accurate);