From c8e50063729f7114c126bf585fdcd7ac6faa3e85 Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Thu, 17 Oct 2024 16:01:27 -0400
Subject: [PATCH] docs: add docs for loading packages

---
 src/api/activation.jl  | 2 +-
 src/api/batched_mul.jl | 5 +++++
 src/api/dense.jl       | 5 +++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/api/activation.jl b/src/api/activation.jl
index 9ef1c544..df44aa0c 100644
--- a/src/api/activation.jl
+++ b/src/api/activation.jl
@@ -10,7 +10,7 @@ generic implementation.
 This function doesn't replace `σ` with `NNlib.fast_act(σ, ...)`; that needs to be
 done by the user if needed.
 
-!!! tip
+!!! tip "Load `SLEEFPirates.jl` to get faster activations"
 
     Certain activation functions are replaced with specialized implementations from
     [SLEEFPirates.jl](https://github.com/JuliaSIMD/SLEEFPirates.jl) for FP32. This might
diff --git a/src/api/batched_mul.jl b/src/api/batched_mul.jl
index a5d7b132..c6cb379a 100644
--- a/src/api/batched_mul.jl
+++ b/src/api/batched_mul.jl
@@ -4,6 +4,11 @@
 Computes the batched matrix multiplication of `x` and `y`. For more details see the
 NNlib documentation on `NNlib.batched_mul`. This function is mostly a wrapper around
 `batched_mul` but attempts to be faster on CPUs.
+
+!!! tip "Load `LoopVectorization.jl` to get faster batched matrix multiplication"
+
+    On CPUs, loading `LoopVectorization.jl` enables a faster implementation of batched
+    matrix multiplication.
 """
 function batched_matmul(x::AbstractMatrix, y::AbstractArray{yT, 3}) where {yT}
     return batched_matmul(expand_batchdim(x), y)
diff --git a/src/api/dense.jl b/src/api/dense.jl
index 0e83dac7..f51b2518 100644
--- a/src/api/dense.jl
+++ b/src/api/dense.jl
@@ -24,6 +24,11 @@ multiple operations.
 - For small CPU Arrays, we use LoopVectorization.jl. On `x86_64` we use Octavian for
   medium sized matrices. This is overridden if special BLAS implementations are loaded
   (currently `MKL`, `AppleAccelerate`, and `BLISBLAS`).
+
+!!! tip "Load `Octavian.jl`"
+
+    Loading `Octavian.jl` enables a polyalgorithm that selects a backend based on the
+    input sizes.
 """
 function fused_dense_bias_activation(σ::F, weight::AbstractMatrix, x::AbstractMatrix,
         b::Optional{<:AbstractVector}) where {F}
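All three tips document the same extension-loading pattern: the base package ships a generic fallback, and loading an optional dependency swaps a faster kernel in behind the same call. Below is a minimal usage sketch of that pattern; it assumes the patched package is LuxLib.jl and that `batched_matmul` and `fused_dense_bias_activation` are exported, neither of which the diff itself states.

```julia
# Hypothetical usage sketch. Assumes the patched package is LuxLib.jl and that
# `batched_matmul` and `fused_dense_bias_activation` are exported — both assumptions.
using LuxLib
using NNlib: gelu

x = rand(Float32, 8, 8, 4)   # 4 batches of 8×8 matrices
y = rand(Float32, 8, 8, 4)
batched_matmul(x, y)         # generic CPU fallback

using LoopVectorization     # per the tip: enables the faster CPU batched matmul
batched_matmul(x, y)         # same call, now hits the specialized CPU path

weight = rand(Float32, 16, 8)
b = rand(Float32, 16)
xmat = rand(Float32, 8, 32)

using Octavian              # per the tip: enables the size-based polyalgorithm
fused_dense_bias_activation(gelu, weight, xmat, b)   # 16×32 result
```

Presumably this is implemented via Julia's package extensions (weak dependencies): the specialized methods are compiled and registered only when the trigger package is loaded, so users who never load `LoopVectorization.jl` or `Octavian.jl` pay no extra load-time cost.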