Merge pull request #34 from ashvardanian/main-dev
118x faster than GCC 12: KL & JS divergence with AVX-512FP16
ashvardanian authored Oct 23, 2023
2 parents 8f2c888 + 02cced0 commit 7730e37
Showing 11 changed files with 331 additions and 412 deletions.
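
For reference, the two divergences named in the commit title are defined as follows; these are the standard textbook definitions, not code taken from the diff below:

```latex
D_{\mathrm{KL}}(P \parallel Q) = \sum_i p_i \log \frac{p_i}{q_i},
\qquad
D_{\mathrm{JS}}(P \parallel Q) = \tfrac{1}{2} D_{\mathrm{KL}}(P \parallel M)
                              + \tfrac{1}{2} D_{\mathrm{KL}}(Q \parallel M),
\quad M = \tfrac{1}{2}(P + Q)
```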
10 changes: 4 additions & 6 deletions .github/workflows/prerelease.yml
@@ -16,7 +16,6 @@ permissions:
contents: read

jobs:

test_python:
name: Test Python ${{ matrix.python-version }} on ${{ matrix.architecture }} ${{ matrix.os }}
runs-on: ${{ matrix.os }}
@@ -34,7 +33,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install --no-cache-dir --upgrade pip
@@ -59,7 +58,7 @@ jobs:
run: |
python -c "import simsimd; print(simsimd.get_capabilities())"
pytest python/test.py -s -x -v
test_javascript:
name: Test JavaScript
runs-on: ubuntu-latest
@@ -68,13 +67,12 @@ jobs:
os: [ubuntu-22.04, macOS-11, windows-2022]
node-version: [18.x]
steps:

- uses: actions/checkout@v4
- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: '18.x'
node-version: "18.x"

- name: Build locally
run: npm install

25 changes: 11 additions & 14 deletions .github/workflows/release.yml
@@ -16,17 +16,15 @@ permissions:
id-token: write

jobs:

versioning:
name: Semantic Release
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- uses: actions/setup-node@v3
- run: npm install --save-dev @semantic-release/exec @semantic-release/git conventional-changelog-eslint semantic-release && npx semantic-release

- uses: actions/checkout@v3
with:
persist-credentials: false
- uses: actions/setup-node@v3
- run: npm install --save-dev @semantic-release/exec @semantic-release/git conventional-changelog-eslint semantic-release && npx semantic-release

rebase:
name: Rebase Dev. Branch
@@ -43,7 +41,7 @@ jobs:
git fetch origin main
git checkout main-dev
git rebase origin/main
- name: Push changes
uses: CasperWA/push-protected@v2
with:
@@ -63,13 +61,13 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: 'main'
ref: "main"
- uses: actions/setup-python@v3

- name: Setup Docker
if: matrix.os == 'ubuntu-22.04'
uses: docker-practice/actions-setup-docker@master

- name: Setup QEMU
if: matrix.os == 'ubuntu-22.04'
uses: docker/[email protected]
@@ -83,8 +81,7 @@ jobs:
- uses: actions/upload-artifact@v3
with:
path: ./wheelhouse/*.whl



publish_python:
name: Publish Python
needs: build_wheels
@@ -115,7 +112,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: 'main'
ref: "main"
- run: git submodule update --init --recursive
- uses: actions/setup-node@v3
with:
21 changes: 13 additions & 8 deletions README.md
@@ -7,6 +7,7 @@ SimSIMD leverages SIMD intrinsics, capabilities that only select compilers effec
- ✅ __3-200x faster__ than NumPy and SciPy distance functions.
- ✅ Euclidean (L2), Inner Product, and Cosine (Angular) spatial distances.
- ✅ Hamming (~ Manhattan) and Jaccard (~ Tanimoto) binary distances.
- ✅ Kullback-Leibler and Jensen–Shannon divergences for probability distributions.
- ✅ Single-precision `f32`, half-precision `f16`, `i8`, and binary vectors.
- ✅ Compatible with GCC and Clang on MacOS and Linux, and MinGW on Windows.
- ✅ Compatible with NumPy, PyTorch, TensorFlow, and other tensors.
@@ -28,18 +29,19 @@ Given 1000 embeddings from OpenAI Ada API with 1536 dimensions, running on the A
| `numpy.inner` | `inner` | __2 x__ | __9 x__ | __18 x__ |
| `scipy.spatial.distance.cosine` | `cosine` | __32 x__ | __79 x__ | __133 x__ |
| `scipy.spatial.distance.sqeuclidean` | `sqeuclidean` | __5 x__ | __26 x__ | __17 x__ |
| `scipy.spatial.distance.jensenshannon` | `jensenshannon` | __41 x__ | __76 x__ | |
| `scipy.spatial.distance.jensenshannon` | `jensenshannon` | __31 x__ | __53 x__ | |
| `scipy.special.kl_div` | `kullbackleibler` | __21 x__ | __18 x__ | |

### Intel Sapphire Rapids

On the Intel Sapphire Rapids platform, SimSIMD was benchmarked against autovectorized-code using GCC 12. GCC handles single-precision `float` and `int8_t` well. However, it fails on `_Float16` arrays, which has been part of the C language since 2011.
On the Intel Sapphire Rapids platform, SimSIMD was benchmarked against auto-vectorized code using GCC 12. GCC handles single-precision `float` and `int8_t` well. However, it fails on `_Float16` arrays, which has been part of the C language since 2011.

| | GCC 12 `f32` | GCC 12 `f16` | SimSIMD `f16` | `f16` improvement |
| :------------ | -----------: | -----------: | ------------: | ----------------: |
| `cosine` | 3.28 M/s | 336.29 k/s | 6.88 M/s | __20 x__ |
| `sqeuclidean` | 4.62 M/s | 147.25 k/s | 5.32 M/s | __36 x__ |
| `inner` | 3.81 M/s | 192.02 k/s | 5.99 M/s | __31 x__ |
| | GCC 12 `f32` | GCC 12 `f16` | SimSIMD `f16` | `f16` improvement |
| :-------------- | -----------: | -----------: | ------------: | ----------------: |
| `cosine` | 3.28 M/s | _336.29 k/s_ | _6.88 M/s_ | __20 x__ |
| `sqeuclidean` | 4.62 M/s | _147.25 k/s_ | _5.32 M/s_ | __36 x__ |
| `inner` | 3.81 M/s | _192.02 k/s_ | _5.99 M/s_ | __31 x__ |
| `jensenshannon` | 1.18 M/s | _18.13 k/s_ | _2.14 M/s_ | __118 x__ |
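
To make the `f16` comparison concrete, a serial Jensen-Shannon loop over `_Float16` inputs might look like the sketch below. It is only an illustration of the kind of scalar code GCC 12 is asked to auto-vectorize, assuming a compiler and target that support `_Float16`; it is not the repository's exact benchmark or SimSIMD kernel, and the function name is hypothetical.

```c
#include <math.h>
#include <stddef.h>

// Hypothetical serial sketch: accumulate p*log(p/m) + q*log(q/m) with
// m = (p + q) / 2, widening every half-precision value to float first.
float js_divergence_f16_serial(_Float16 const* p, _Float16 const* q, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i != n; ++i) {
        float pi = (float)p[i], qi = (float)q[i];
        float mi = 0.5f * (pi + qi);
        if (pi > 0.0f) sum += pi * logf(pi / mi);
        if (qi > 0.0f) sum += qi * logf(qi / mi);
    }
    return 0.5f * sum; // halve at the end: JS = (KL(P||M) + KL(Q||M)) / 2
}
```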

__Technical Insights__:

@@ -168,7 +170,10 @@ Should you wish to integrate SimSIMD within USearch, simply compile USearch with
__To rerun experiments__ utilize the following command:

```sh
cmake -DCMAKE_BUILD_TYPE=Release -DSIMSIMD_BUILD_BENCHMARKS=1 -B ./build_release && make -C ./build_release && ./build_release/simsimd_bench
cmake -DCMAKE_BUILD_TYPE=Release -DSIMSIMD_BUILD_BENCHMARKS=1 -B ./build_release
cmake --build build_release --config Release
./build_release/simsimd_bench
./build_release/simsimd_bench --benchmark_filter=js
```
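
`--benchmark_filter` is Google Benchmark's regular-expression filter, so the last invocation above runs only the benchmarks whose registered names contain `js`, i.e. the Jensen-Shannon kernels registered in `cpp/bench.cxx`.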

__To test and benchmark with Python bindings__:
20 changes: 17 additions & 3 deletions cpp/bench.cxx
@@ -3,8 +3,8 @@

#include <benchmark/benchmark.h>

#define SIMSIMD_RSQRT simsimd_approximate_inverse_square_root
#define SIMSIMD_LOG simsimd_approximate_log
#define SIMSIMD_RSQRT sqrtf
#define SIMSIMD_LOG logf
#include <simsimd/simsimd.h>

namespace bm = benchmark;
@@ -44,7 +44,7 @@ template <typename scalar_at, std::size_t dimensions_ak> struct vectors_pair_gt
a2_sum = std::sqrt(a2_sum);
b2_sum = std::sqrt(b2_sum);
for (std::size_t i = 0; i != dimensions_ak; ++i)
a[i] /= a2_sum, b[i] /= b2_sum;
a[i] = static_cast<scalar_at>(a[i] / a2_sum), b[i] = static_cast<scalar_at>(b[i] / b2_sum);
}
}
};
@@ -129,10 +129,14 @@ int main(int argc, char** argv) {
register_<simsimd_f16_t>("neon_f16_ip", simsimd_neon_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("neon_f16_cos", simsimd_neon_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("neon_f16_l2sq", simsimd_neon_f16_l2sq, simsimd_accurate_f16_l2sq);
register_<simsimd_f16_t>("neon_f16_kl", simsimd_neon_f16_kl, simsimd_accurate_f16_kl);
register_<simsimd_f16_t>("neon_f16_js", simsimd_neon_f16_js, simsimd_accurate_f16_js);

register_<simsimd_f32_t>("neon_f32_ip", simsimd_neon_f32_ip, simsimd_accurate_f32_ip);
register_<simsimd_f32_t>("neon_f32_cos", simsimd_neon_f32_cos, simsimd_accurate_f32_cos);
register_<simsimd_f32_t>("neon_f32_l2sq", simsimd_neon_f32_l2sq, simsimd_accurate_f32_l2sq);
register_<simsimd_f32_t>("neon_f32_kl", simsimd_neon_f32_kl, simsimd_accurate_f32_kl);
register_<simsimd_f32_t>("neon_f32_js", simsimd_neon_f32_js, simsimd_accurate_f32_js);

register_<simsimd_i8_t>("neon_i8_cos", simsimd_neon_i8_cos, simsimd_accurate_i8_cos);
register_<simsimd_i8_t>("neon_i8_l2sq", simsimd_neon_i8_l2sq, simsimd_accurate_i8_l2sq);
@@ -152,6 +156,8 @@ int main(int argc, char** argv) {
register_<simsimd_f16_t>("avx2_f16_ip", simsimd_avx2_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("avx2_f16_cos", simsimd_avx2_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("avx2_f16_l2sq", simsimd_avx2_f16_l2sq, simsimd_accurate_f16_l2sq);
register_<simsimd_f16_t>("avx2_f16_kl", simsimd_avx2_f16_kl, simsimd_accurate_f16_kl);
register_<simsimd_f16_t>("avx2_f16_js", simsimd_avx2_f16_js, simsimd_accurate_f16_js);

register_<simsimd_i8_t>("avx2_i8_cos", simsimd_avx2_i8_cos, simsimd_accurate_i8_cos);
register_<simsimd_i8_t>("avx2_i8_l2sq", simsimd_avx2_i8_l2sq, simsimd_accurate_i8_l2sq);
@@ -161,22 +167,30 @@
register_<simsimd_f16_t>("avx512_f16_ip", simsimd_avx512_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("avx512_f16_cos", simsimd_avx512_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("avx512_f16_l2sq", simsimd_avx512_f16_l2sq, simsimd_accurate_f16_l2sq);
register_<simsimd_f16_t>("avx512_f16_kl", simsimd_avx512_f16_kl, simsimd_accurate_f16_kl);
register_<simsimd_f16_t>("avx512_f16_js", simsimd_avx512_f16_js, simsimd_accurate_f16_js);

register_<simsimd_i8_t>("avx512_i8_cos", simsimd_avx512_i8_cos, simsimd_accurate_i8_cos);
register_<simsimd_i8_t>("avx512_i8_l2sq", simsimd_avx512_i8_l2sq, simsimd_accurate_i8_l2sq);

register_<simsimd_f32_t>("avx512_f32_ip", simsimd_avx512_f32_ip, simsimd_accurate_f32_ip);
register_<simsimd_f32_t>("avx512_f32_cos", simsimd_avx512_f32_cos, simsimd_accurate_f32_cos);
register_<simsimd_f32_t>("avx512_f32_l2sq", simsimd_avx512_f32_l2sq, simsimd_accurate_f32_l2sq);
register_<simsimd_f32_t>("avx512_f32_kl", simsimd_avx512_f32_kl, simsimd_accurate_f32_kl);
register_<simsimd_f32_t>("avx512_f32_js", simsimd_avx512_f32_js, simsimd_accurate_f32_js);
#endif

register_<simsimd_f16_t>("serial_f16_ip", simsimd_serial_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("serial_f16_cos", simsimd_serial_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("serial_f16_l2sq", simsimd_serial_f16_l2sq, simsimd_accurate_f16_l2sq);
register_<simsimd_f16_t>("serial_f16_kl", simsimd_serial_f16_kl, simsimd_accurate_f16_kl);
register_<simsimd_f16_t>("serial_f16_js", simsimd_serial_f16_js, simsimd_accurate_f16_js);

register_<simsimd_f32_t>("serial_f32_ip", simsimd_serial_f32_ip, simsimd_accurate_f32_ip);
register_<simsimd_f32_t>("serial_f32_cos", simsimd_serial_f32_cos, simsimd_accurate_f32_cos);
register_<simsimd_f32_t>("serial_f32_l2sq", simsimd_serial_f32_l2sq, simsimd_accurate_f32_l2sq);
register_<simsimd_f32_t>("serial_f32_kl", simsimd_serial_f32_kl, simsimd_accurate_f32_kl);
register_<simsimd_f32_t>("serial_f32_js", simsimd_serial_f32_js, simsimd_accurate_f32_js);

register_<simsimd_i8_t>("serial_i8_cos", simsimd_serial_i8_cos, simsimd_accurate_i8_cos);
register_<simsimd_i8_t>("serial_i8_l2sq", simsimd_serial_i8_l2sq, simsimd_accurate_i8_l2sq);
4 changes: 1 addition & 3 deletions include/simsimd/binary.h
@@ -136,12 +136,11 @@ simsimd_sve_b8_jaccard(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_siz
__attribute__((target("avx512vpopcntdq,avx512vl,avx512bw,avx512f"))) //
inline static simsimd_f32_t
simsimd_avx512_b8_hamming(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words) {
__m512i differences_vec = _mm512_setzero_si512(), union_vec = _mm512_setzero_si512();
__m512i differences_vec = _mm512_setzero_si512();
for (simsimd_size_t i = 0; i < n_words; i += 64) {

// Compute mask for tail elements
__mmask64 mask = (i + 64 <= n_words) ? 0xFFFFFFFFFFFFFFFF : (((1ull << (n_words - i)) - 1ull));

__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a + i);
__m512i b_vec = _mm512_maskz_loadu_epi8(mask, b + i);
__m512i xor_vec = _mm512_xor_si512(a_vec, b_vec);
@@ -161,7 +160,6 @@

// Compute mask for tail elements
__mmask64 mask = (i + 64 <= n_words) ? 0xFFFFFFFFFFFFFFFF : (((1ull << (n_words - i)) - 1ull));

__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a + i);
__m512i b_vec = _mm512_maskz_loadu_epi8(mask, b + i);
__m512i and_vec = _mm512_and_si512(a_vec, b_vec);
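
As a side note on the masked loads kept in this file: the `(1ull << (n_words - i)) - 1ull` expression builds a byte mask whose low bits cover only the tail of the input, so `_mm512_maskz_loadu_epi8` never reads past the end of the buffer. A standalone sketch of the same idiom, with hypothetical names and no intrinsics, purely for illustration:

```c
#include <stdint.h>
#include <stdio.h>

// Hypothetical illustration of the tail-masking idiom: for a final partial
// block of `remaining` bytes (1..63), keep only the low `remaining` bits.
int main(void) {
    uint64_t remaining = 5; // e.g. 5 trailing bytes left in the last 64-byte block
    uint64_t mask = (remaining >= 64) ? ~0ull : ((1ull << remaining) - 1ull);
    printf("mask = 0x%016llx\n", (unsigned long long)mask); // prints 0x000000000000001f
    return 0;
}
```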