Merge branch 'main-dev' of https://github.com/ashvardanian/SimSIMD in…

…to main-dev
ashvardanian · Oct 27, 2024 · 99d810d · 99d810d
2 parents 047c340 + 86a47eb
commit 99d810d
Show file tree

Hide file tree

Showing 2 changed files with 103 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -52,6 +52,7 @@ Implemented distance functions include:
 - Set Intersections for Sparse Vectors and Text Analysis. _[docs][docs-sparse]_
 - Mahalanobis distance and Quadratic forms for Scientific Computing. _[docs][docs-curved]_
 - Kullback-Leibler and Jensen–Shannon divergences for probability distributions. _[docs][docs-probability]_
+- Fused-Multiply-Add (FMA) and Weighted Sums to replace BLAS level 1 functions. _[docs][docs-fma]_
 - For Levenshtein, Needleman–Wunsch, and Smith-Waterman, check [StringZilla][stringzilla].
 - 🔜 Haversine and Vincenty's formulae for Geospatial Analysis.
 
@@ -61,6 +62,7 @@ Implemented distance functions include:
 [docs-binary]: https://github.com/ashvardanian/SimSIMD/pull/138
 [docs-dot]: #complex-dot-products-conjugate-dot-products-and-complex-numbers
 [docs-probability]: #logarithms-in-kullback-leibler--jensenshannon-divergences
+[docs-fma]: #mixed-precision-in-fused-multiply-add-and-weighted-sums
 [scipy]: https://docs.scipy.org/doc/scipy/reference/spatial.distance.html#module-scipy.spatial.distance
 [numpy]: https://numpy.org/doc/stable/reference/generated/numpy.inner.html
 [stringzilla]: https://github.com/ashvardanian/stringzilla
@@ -122,7 +124,8 @@ Use the following snippet to install SimSIMD and list available hardware acceler
 
 ```sh
 pip install simsimd
-python -c "import simsimd; print(simsimd.get_capabilities())"
+python -c "import simsimd; print(simsimd.get_capabilities())"   # for hardware introspection
+python -c "import simsimd; help(simsimd)"                       # for documentation
 ```
 
 With precompiled binaries, SimSIMD ships `.pyi` interface files for type hinting and static analysis.
@@ -929,6 +932,36 @@ Jensen-Shannon divergence is a symmetrized and smoothed version of the Kullback-
 
 Both functions are defined for non-negative numbers, and the logarithm is a key part of their computation.
 
+### Mixed Precision in Fused-Multiply-Add and Weighted Sums
+
+The Fused-Multiply-Add (FMA) operation is a single operation that combines element-wise multiplication and addition with different scaling factors.
+The Weighted Sum is its simplified variant without element-wise multiplication.
+
+```math
+\text{FMA}_i(A, B, C, \alpha, \beta) = \alpha \cdot A_i \cdot B_i + \beta \cdot C_i
+```
+
+```math
+\text{WSum}_i(A, B, \alpha, \beta) = \alpha \cdot A_i + \beta \cdot B_i
+```
+
+In NumPy terms, the implementation may look like:
+
+```py
+import numpy as np
+def wsum(A: np.ndarray, B: np.ndarray, Alpha: float, Beta: float) -> np.ndarray:  # element-wise Alpha * A + Beta * B
+    assert A.dtype == B.dtype, "Input types must match and affect the output style"  # both inputs must share one dtype
+    return (Alpha * A + Beta * B).astype(A.dtype)  # cast the scaled sum back to the input dtype
+def fma(A: np.ndarray, B: np.ndarray, C: np.ndarray, Alpha: float, Beta: float) -> np.ndarray:  # element-wise Alpha * A * B + Beta * C
+    assert A.dtype == B.dtype and A.dtype == C.dtype, "Input types must match and affect the output style"  # all three inputs must share one dtype
+    return (Alpha * A * B + Beta * C).astype(A.dtype)  # cast the result back to the input dtype
+```
+
+The tricky part is implementing those operations in mixed precision, where the scaling factors are of different precision than the input and output vectors.
+SimSIMD uses double-precision floating-point scaling factors for any input and output precision, including `i8` and `u8` integers and `f16` and `bf16` floats.
+On CPU generations with native support for `f16` addition and multiplication, `f16` temporaries are used for `i8` and `u8` multiplication, scaling, and addition.
+For `bf16`, native support is generally limited to dot-products with subsequent partial accumulation, which is not enough for the FMA and WSum operations, so `f32` is used as a temporary.
+
 ### Auto-Vectorization & Loop Unrolling
 
 On the Intel Sapphire Rapids platform, SimSIMD was benchmarked against auto-vectorized code using GCC 12.

diff --git a/python/annotations/__init__.pyi b/python/annotations/__init__.pyi
@@ -27,6 +27,8 @@ _MetricType = Literal[
     "intersection",
     "bilinear",
     "mahalanobis",
+    "fma",
+    "wsum",
 ]
 _IntegralType = Literal[
     # Booleans
@@ -115,8 +117,9 @@ def cdist(
     *,
     threads: int = 1,
     dtype: Optional[Union[_IntegralType, _FloatType, _ComplexType]] = None,
+    out: Optional[_BufferType] = None,
     out_dtype: Union[_FloatType, _ComplexType] = "d",
-) -> Union[float, complex, DistancesTensor]: ...
+) -> Optional[Union[float, complex, DistancesTensor]]: ...
 
 # ---------------------------------------------------------------------
 # Vector-vector dot products for real and complex numbers
@@ -129,7 +132,10 @@ def inner(
     b: _BufferType,
     /,
     dtype: Optional[Union[_FloatType, _ComplexType]] = None,
-) -> Union[float, complex, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType, _ComplexType] = "d",
+) -> Optional[Union[float, complex, DistancesTensor]]: ...
 
 # Dot product, similar to: `numpy.dot`.
 # https://numpy.org/doc/stable/reference/generated/numpy.dot.html
@@ -138,7 +144,10 @@ def dot(
     b: _BufferType,
     /,
     dtype: Optional[Union[_FloatType, _ComplexType]] = None,
-) -> Union[float, complex, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType, _ComplexType] = None,
+) -> Optional[Union[float, complex, DistancesTensor]]: ...
 
 # Vector-vector dot product for complex conjugates, similar to: `numpy.vdot`.
 # https://numpy.org/doc/stable/reference/generated/numpy.vdot.html
@@ -147,7 +156,10 @@ def vdot(
     b: _BufferType,
     /,
     dtype: Optional[_ComplexType] = None,
-) -> Union[complex, DistancesTensor]: ...
+    *,
+    out: Optional[Union[float, complex, DistancesTensor]] = None,
+    out_dtype: Optional[_ComplexType] = None,
+) -> Optional[Union[complex, DistancesTensor]]: ...
 
 # ---------------------------------------------------------------------
 # Vector-vector spatial distance metrics for real and integer numbers
@@ -161,7 +173,10 @@ def sqeuclidean(
     b: _BufferType,
     /,
     dtype: Optional[Union[_IntegralType, _FloatType]] = None,
-) -> Union[float, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType] = None,
+) -> Optional[Union[float, DistancesTensor]]: ...
 
 # Vector-vector cosine distance, similar to: `scipy.spatial.distance.cosine`.
 # https://docs.scipy.org/doc/scipy-1.11.4/reference/generated/scipy.spatial.distance.cosine.html
@@ -170,7 +185,10 @@ def cosine(
     b: _BufferType,
     /,
     dtype: Optional[Union[_IntegralType, _FloatType]] = None,
-) -> Union[float, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType] = None,
+) -> Optional[Union[float, DistancesTensor]]: ...
 
 # ---------------------------------------------------------------------
 # Vector-vector similarity functions for binary vectors
@@ -183,7 +201,10 @@ def hamming(
     b: _BufferType,
     /,
     dtype: Optional[_IntegralType] = None,
-) -> Union[float, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType] = None,
+) -> Optional[Union[float, DistancesTensor]]: ...
 
 # Vector-vector Jaccard distance, similar to: `scipy.spatial.distance.jaccard`.
 # https://docs.scipy.org/doc/scipy-1.11.4/reference/generated/scipy.spatial.distance.jaccard.html
@@ -192,7 +213,10 @@ def jaccard(
     b: _BufferType,
     /,
     dtype: Optional[_IntegralType] = None,
-) -> Union[float, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType] = None,
+) -> Optional[Union[float, DistancesTensor]]: ...
 
 # ---------------------------------------------------------------------
 # Vector-vector similarity between probability distributions
@@ -205,7 +229,10 @@ def jensenshannon(
     b: _BufferType,
     /,
     dtype: Optional[_FloatType] = None,
-) -> Union[float, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType] = None,
+) -> Optional[Union[float, DistancesTensor]]: ...
 
 # Vector-vector Kullback-Leibler divergence, similar to: `scipy.spatial.distance.kullback_leibler`.
 # https://docs.scipy.org/doc/scipy-1.11.4/reference/generated/scipy.spatial.distance.kullback_leibler.html
@@ -214,7 +241,10 @@ def kullbackleibler(
     b: _BufferType,
     /,
     dtype: Optional[_FloatType] = None,
-) -> Union[float, DistancesTensor]: ...
+    *,
+    out: Optional[_BufferType] = None,
+    out_dtype: Union[_FloatType] = None,
+) -> Optional[Union[float, DistancesTensor]]: ...
 
 # ---------------------------------------------------------------------
 # Vector-vector similarity between vectors in curved spaces
@@ -247,3 +277,32 @@ def mahalanobis(
 # Vector-vector intersection similarity, similar to: `numpy.intersect1d`.
 # https://numpy.org/doc/stable/reference/generated/numpy.intersect1d.html
 def intersection(array1: _BufferType, array2: _BufferType, /) -> float: ...
+
+# ---------------------------------------------------------------------
+# Vector-vector math: FMA, WSum
+# ---------------------------------------------------------------------
+
+# Vector-vector element-wise fused-multiply add: `alpha * a[i] * b[i] + beta * c[i]`.
+def fma(
+    a: _BufferType,  # first multiplicand
+    b: _BufferType,  # second multiplicand
+    c: _BufferType,  # addend
+    /,
+    dtype: Optional[Union[_FloatType, _IntegralType]] = None,
+    *,
+    alpha: float = 1,  # scale applied to the element-wise product of `a` and `b`
+    beta: float = 1,  # scale applied to `c`
+    out: Optional[_BufferType] = None,  # NOTE(review): presumably receives the result, with None returned instead - confirm
+) -> Optional[DistancesTensor]: ...
+
+# Vector-vector element-wise weighted sum.
+def wum(
+    a: _BufferType,
+    b: _BufferType,
+    /,
+    dtype: Optional[Union[_FloatType, _IntegralType]] = None,
+    *,
+    alpha: float = 1,
+    beta: float = 1,
+    out: Optional[_BufferType] = None,
+) -> Optional[DistancesTensor]: ...