From dc52b3eef73f8646c7a03893ad966d8efde5e9e3 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 26 Jun 2024 23:11:37 -0700 Subject: [PATCH] Finish rewriting the tests --- .buildkite/pipeline.yml | 6 +- .github/workflows/CI.yml | 2 +- .github/workflows/Downgrade.yml | 2 +- .github/workflows/Downstream.yml | 2 +- .github/workflows/FormatCheck.yml | 40 ---- .github/workflows/QualityCheck.yml | 19 ++ Project.toml | 2 - README.md | 1 - ext/WeightInitializersCUDAExt.jl | 3 +- src/initializers.jl | 96 +++++----- test/initializers_tests.jl | 267 +++++++++++++++++++++++++++ test/qa_tests.jl | 23 +++ test/runtests.jl | 287 +---------------------------- test/shared_testsetup.jl | 20 ++ test/utils_tests.jl | 9 + 15 files changed, 395 insertions(+), 384 deletions(-) delete mode 100644 .github/workflows/FormatCheck.yml create mode 100644 .github/workflows/QualityCheck.yml create mode 100644 test/initializers_tests.jl create mode 100644 test/qa_tests.jl create mode 100644 test/shared_testsetup.jl create mode 100644 test/utils_tests.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a625b0f..565e58f 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -16,7 +16,7 @@ steps: queue: "juliagpu" cuda: "*" env: - GROUP: "CUDA" + BACKEND_GROUP: "CUDA" if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 240 matrix: @@ -61,7 +61,7 @@ steps: queue: "juliagpu" cuda: "*" env: - GROUP: "CUDA" + BACKEND_GROUP: "CUDA" DOWNSTREAM_TEST_REPO: "{{matrix.repo}}" if: build.message !~ /\[skip tests\]/ || build.message !~ /\[skip downstream\]/ timeout_in_minutes: 240 @@ -111,7 +111,7 @@ steps: rocm: "*" rocmgpu: "*" env: - GROUP: "AMDGPU" + BACKEND_GROUP: "AMDGPU" JULIA_AMDGPU_CORE_MUST_LOAD: "1" JULIA_AMDGPU_HIP_MUST_LOAD: "1" JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 2ad20de..6596d9d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -37,7 +37,7 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 env: - GROUP: "CPU" + BACKEND_GROUP: "CPU" RETESTITEMS_NWORKERS: 4 RETESTITEMS_NWORKER_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml index 269275e..5a5bcb1 100644 --- a/.github/workflows/Downgrade.yml +++ b/.github/workflows/Downgrade.yml @@ -27,7 +27,7 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 env: - GROUP: "CPU" + BACKEND_GROUP: "CPU" RETESTITEMS_NWORKERS: 4 RETESTITEMS_NWORKER_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 diff --git a/.github/workflows/Downstream.yml b/.github/workflows/Downstream.yml index b215b2b..bf579cb 100644 --- a/.github/workflows/Downstream.yml +++ b/.github/workflows/Downstream.yml @@ -16,7 +16,7 @@ jobs: name: ${{ matrix.package.repo }}/${{ matrix.package.group }} runs-on: ${{ matrix.os }} env: - GROUP: ${{ matrix.package.group }} + BACKEND_GROUP: ${{ matrix.package.group }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/FormatCheck.yml b/.github/workflows/FormatCheck.yml deleted file mode 100644 index ac75c52..0000000 --- a/.github/workflows/FormatCheck.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: FormatCheck - -on: - push: - branches: - - 'main' - - 'release-' - tags: ['*'] - pull_request: - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - julia-version: ["1"] - julia-arch: [x86] - os: [ubuntu-latest] - steps: - - uses: julia-actions/setup-julia@latest - 
with: - version: ${{ matrix.julia-version }} - - - uses: actions/checkout@v4 - - name: Install JuliaFormatter and format - run: | - julia -e 'using Pkg; Pkg.add(PackageSpec(name="JuliaFormatter"))' - julia -e 'using JuliaFormatter; format(".", verbose=true)' - - name: Format check - run: | - julia -e ' - out = Cmd(`git diff --name-only`) |> read |> String - if out == "" - exit(0) - else - @error "Some files have not been formatted !!!" - write(stdout, out) - exit(1) - end' - \ No newline at end of file diff --git a/.github/workflows/QualityCheck.yml b/.github/workflows/QualityCheck.yml new file mode 100644 index 0000000..3bfa611 --- /dev/null +++ b/.github/workflows/QualityCheck.yml @@ -0,0 +1,19 @@ +name: Code Quality Check + +on: [pull_request] + +jobs: + code-style: + name: Format Suggestions + runs-on: ubuntu-latest + steps: + - uses: julia-actions/julia-format@v3 + + typos-check: + name: Spell Check with Typos + runs-on: ubuntu-latest + steps: + - name: Checkout Actions Repository + uses: actions/checkout@v4 + - name: Check spelling + uses: crate-ci/typos@v1.22.9 diff --git a/Project.toml b/Project.toml index be3e84a..6981002 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,6 @@ authors = ["Avik Pal and contributors"] version = "0.1.8" [deps] -ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -21,7 +20,6 @@ WeightInitializersCUDAExt = "CUDA" [compat] Aqua = "0.8.7" -ArgCheck = "2.3.0" CUDA = "5.3.2" ChainRulesCore = "1.23" Documenter = "1.5.0" diff --git a/README.md b/README.md index edede1c..4dc182c 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,6 @@ [![Build status](https://badge.buildkite.com/ffa2c8c3629cd58322446cddd3e8dcc4f121c28a574ee3e626.svg?branch=main)](https://buildkite.com/julialang/weightinitializers-dot-jl) [![CI](https://github.com/LuxDL/WeightInitializers.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/LuxDL/WeightInitializers.jl/actions/workflows/CI.yml) [![codecov](https://codecov.io/gh/LuxDL/WeightInitializers.jl/branch/main/graph/badge.svg?token=1ZY0A2NPEM)](https://codecov.io/gh/LuxDL/WeightInitializers.jl) -[![Package Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/WeightInitializers)](https://pkgs.genieframework.com?packages=WeightInitializers) [![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor's%20Guide-blueviolet)](https://github.com/SciML/ColPrac) [![SciML Code Style](https://img.shields.io/static/v1?label=code%20style&message=SciML&color=9558b2&labelColor=389826)](https://github.com/SciML/SciMLStyle) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index e97f268..ac2d391 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -1,8 +1,7 @@ module WeightInitializersCUDAExt using CUDA: CUDA, CURAND -using Random: Random, shuffle -using WeightInitializers: WeightInitializers, NUM_TO_FPOINT, __partial_apply +using WeightInitializers: WeightInitializers const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} diff --git a/src/initializers.jl b/src/initializers.jl index 7877d2b..2a5e4c8 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -104,7 +104,8 @@ truncated normal distribution. 
The numbers are distributed like
function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T(0),
        std=T(1), lo=-T(2), hi=T(2)) where {T <: Real}
    if (mean < lo - 2 * std) || (mean > hi + 2 * std)
-        @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate."
+        @warn "Mean is more than 2 std outside the limits in truncated_normal, so the \
+               distribution of values may be inaccurate."
    end
    l = _norm_cdf((T(lo) - T(mean)) / T(std))
    u = _norm_cdf((T(hi) - T(mean)) / T(std))
@@ -122,13 +123,12 @@ end
    gain = 1) -> AbstractArray{T, length(dims)}

Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a
-(semi) orthogonal matrix, as described in [^Saxe14]
+(semi) orthogonal matrix, as described in [1].

The function constructs an orthogonal or semi-orthogonal matrix depending on the specified
-dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`.
-For more than two dimensions, it computes an orthogonal matrix of
-size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to
-the original dimensions.
+dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. For more
+than two dimensions, it computes an orthogonal matrix of size `prod(dims[1:(end - 1)])` by
+`dims[end]` before reshaping it to the original dimensions.

Cannot construct a vector, i.e., `length(dims) == 1` is forbidden.
@@ -141,9 +141,8 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden.

# References

-[^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of
-learning in deep linear neural networks",
-ICLR 2014, https://arxiv.org/abs/1312.6120
+[1] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in
+deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120
"""
function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...;
        gain::Number=T(1.0)) where {T <: Number}
@@ -164,12 +163,15 @@ end
    sparsity::Number, std::Number=0.01) -> AbstractArray{T}

Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements,
-using random numbers drawn from a normal distribution for the non-zero elements.
-This method is introduced in [^Martens2010].
-Note: The sparsity parameter controls the proportion of the matrix that will be zeroed.
-For example, a sparsity of 0.3 means that approximately 30% of the elements will be
-set to zero. The non-zero elements are distributed according to a normal distribution,
-scaled by the std parameter.
+using random numbers drawn from a normal distribution for the non-zero elements. This method
+was introduced in [1].
+
+!!! note
+
+    The sparsity parameter controls the proportion of the matrix that will be zeroed. For
+    example, a sparsity of 0.3 means that approximately 30% of the elements will be set to
+    zero. The non-zero elements are distributed according to a normal distribution, scaled
+    by the std parameter.

# Arguments

  - `rng::AbstractRNG`: The random number generator to use.
  - `T::Type{<:Number}`: The numeric type of the elements in the returned array.
  - `dims::Integer...`: The dimensions of the weight matrix to be generated.
  - `sparsity::Number`: The proportion of elements to be zeroed. Must be between 0 and 1.
-  - `std::Number=0.01`: The standard deviation of the normal distribution
-    before applying `gain`.
+  - `std::Number=0.01`: The standard deviation of the normal distribution used for the
+    non-zero elements.
# Returns - - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` - and type `T`. + - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` and type + `T`. # Examples -```julia -using Random +```jldoctest +julia> y = sparse_init(Xoshiro(123), Float32, 5, 5; sparsity=0.3, std=0.01); -# Initialize a 5x5 sparsely initialized matrix with 30% sparsity -rng = MersenneTwister(123) -matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01) -``` +julia> y isa Matrix{Float32} +true -``` -5×5 Matrix{Float64}: - 0.0 0.00273815 0.00592403 0.0 0.0 - 0.00459416 -0.000754831 -0.00888936 -0.0077507 0.0 - 0.0 -0.00194229 0.0 0.0 -0.00468489 - 0.0114265 0.0 0.0 -0.00734886 0.00277726 - -0.00396679 0.0 0.00327215 -0.0071741 -0.00880897 +julia> size(y) == (5, 5) +true ``` # References -[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" -_Proceedings of the 27th International Conference on International Conference -on Machine Learning_. 2010. +[1] Martens, J, "Deep learning via Hessian-free optimization" Proceedings of the 27th +International Conference on International Conference on Machine Learning. 2010. """ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=T(0.01)) where {T <: Number} if length(dims) != 2 - throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse \ + initialization.")) end rows, cols = dims @@ -250,8 +245,8 @@ most layers of a neural network. The identity mapping is scaled by the `gain` pa - Layers must have `input_size == output_size` for a perfect identity mapping. In cases where this condition is not met, the function pads extra dimensions with zeros. - For convolutional layers to achieve an identity mapping, kernel sizes must be odd, - and appropriate padding must be applied to ensure the output - feature maps are the same size as the input feature maps. + and appropriate padding must be applied to ensure the output feature maps are the same + size as the input feature maps. # Arguments @@ -271,16 +266,21 @@ most layers of a neural network. The identity mapping is scaled by the `gain` pa # Examples -```julia -using Random - -# Identity matrix for fully connected layer -identity_matrix = identity_init(MersenneTwister(123), Float32, 5, 5) - -# Identity tensor for convolutional layer -identity_tensor = identity_init(MersenneTwister(123), Float32, # Bias initialization - 3, 3, 5, # Matrix multiplication - 5; gain=1.5, shift=(1, 0)) +```jldoctest +julia> identity_init(Xoshiro(123), Float32, 5, 5) +5×5 Matrix{Float32}: + 1.0 1.0 1.0 1.0 1.0 + 1.0 1.0 1.0 1.0 1.0 + 1.0 1.0 1.0 1.0 1.0 + 1.0 1.0 1.0 1.0 1.0 + 1.0 1.0 1.0 1.0 1.0 + +julia> identity_init(Xoshiro(123), Float32, 3, 3, 1, 1; gain=1.5) +3×3×1×1 Array{Float32, 4}: +[:, :, 1, 1] = + 0.0 0.0 0.0 + 0.0 1.5 0.0 + 0.0 0.0 0.0 ``` """ function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; diff --git a/test/initializers_tests.jl b/test/initializers_tests.jl new file mode 100644 index 0000000..202e10d --- /dev/null +++ b/test/initializers_tests.jl @@ -0,0 +1,267 @@ +@testitem "Warning: truncated_normal" begin + @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so \ + the distribution of values may be inaccurate." 
truncated_normal(2; mean=-5.0f0) +end + +@testitem "Identity Initialization" begin + @testset "Non-identity sizes" begin + @test identity_init(2, 3)[:, end] == zeros(Float32, 2) + @test identity_init(3, 2; shift=1)[1, :] == zeros(Float32, 2) + @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) + @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) + @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) + end +end + +@testitem "Orthogonal Initialization" setup=[SharedTestSetup] begin + using GPUArraysCore, LinearAlgebra + + @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in RNGS_ARRTYPES + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. + # In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rng, rows, cols) + GPUArraysCore.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : + (@test v' * v ≈ I(cols)) + end + + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(rng, mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + GPUArraysCore.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : + (@test v' * v ≈ I(cols)) + end + + @testset "Orthogonal Types $T" for T in (Float32, Float64) + @test eltype(orthogonal(rng, T, 3, 4; gain=1.5)) == T + @test eltype(orthogonal(rng, T, 3, 4, 5; gain=1.5)) == T + end + + @testset "Orthogonal AbstractArray Type $T" for T in (Float32, Float64) + @test orthogonal(rng, T, 3, 5) isa AbstractArray{T, 2} + @test orthogonal(rng, T, 3, 5) isa arrtype{T, 2} + + cl = orthogonal(rng) + @test cl(T, 3, 5) isa arrtype{T, 2} + + cl = orthogonal(rng, T) + @test cl(3, 5) isa arrtype{T, 2} + end + + @testset "Orthogonal Closure" begin + cl = orthogonal(;) + + # Sizes + @test size(cl(3, 4)) == (3, 4) + @test size(cl(rng, 3, 4)) == (3, 4) + @test size(cl(3, 4, 5)) == (3, 4, 5) + @test size(cl(rng, 3, 4, 5)) == (3, 4, 5) + + # Type + @test eltype(cl(4, 2)) == Float32 + @test eltype(cl(rng, 4, 2)) == Float32 + end + end +end + +@testitem "Sparse Initialization" setup=[SharedTestSetup] begin + using Statistics + + @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in RNGS_ARRTYPES + # sparse_init should yield an error for non 2-d dimensions + # sparse_init should yield no zero elements if sparsity < 0 + # sparse_init should yield all zero elements if sparsity > 1 + # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for + # other sparsity values + # sparse_init should yield a kernel in its non-zero elements consistent with the std + # parameter + + @test_throws ArgumentError sparse_init(3, 4, 5, sparsity=0.1) + @test_throws ArgumentError sparse_init(3, sparsity=0.1) + v = sparse_init(100, 100; sparsity=-0.1) + @test sum(v .== 0) == 0 + v = sparse_init(100, 100; sparsity=1.1) + @test sum(v .== 0) == length(v) + + for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] + expected_zeros = ceil(Integer, n_in * sparsity) + v = sparse_init(n_in, n_out; sparsity=sparsity, std=σ) + @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) + @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + end + + @testset "sparse_init Types $T" for T in (Float16, Float32, Float64) + @test eltype(sparse_init(rng, T, 3, 4; sparsity=0.5)) == T + end + + @testset "sparse_init AbstractArray Type $T" for T in (Float16, Float32, Float64) + @test sparse_init(T, 3, 5; sparsity=0.5) isa AbstractArray{T, 2} 
+ @test sparse_init(rng, T, 3, 5; sparsity=0.5) isa arrtype{T, 2} + + cl = sparse_init(rng; sparsity=0.5) + @test cl(T, 3, 5) isa arrtype{T, 2} + + cl = sparse_init(rng, T; sparsity=0.5) + @test cl(3, 5) isa arrtype{T, 2} + end + + @testset "sparse_init Closure" begin + cl = sparse_init(; sparsity=0.5) + # Sizes + @test size(cl(3, 4)) == (3, 4) + @test size(cl(rng, 3, 4)) == (3, 4) + # Type + @test eltype(cl(4, 2)) == Float32 + @test eltype(cl(rng, 4, 2)) == Float32 + end + end +end + +@testitem "Basic Initializations" setup=[SharedTestSetup] begin + using LinearAlgebra, Statistics + + @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in RNGS_ARRTYPES + @testset "Sizes and Types: $init" for init in [ + zeros32, ones32, rand32, randn32, kaiming_uniform, kaiming_normal, + glorot_uniform, glorot_normal, truncated_normal, identity_init] + # Sizes + @test size(init(3)) == (3,) + @test size(init(rng, 3)) == (3,) + @test size(init(3, 4)) == (3, 4) + @test size(init(rng, 3, 4)) == (3, 4) + @test size(init(3, 4, 5)) == (3, 4, 5) + @test size(init(rng, 3, 4, 5)) == (3, 4, 5) + # Type + @test eltype(init(rng, 4, 2)) == Float32 + @test eltype(init(4, 2)) == Float32 + # RNG Closure + cl = init(rng) + @test cl(3) isa arrtype{Float32, 1} + @test cl(3, 5) isa arrtype{Float32, 2} + end + + @testset "Sizes and Types: $init" for (init, fp) in [ + (zeros16, Float16), (zerosC16, ComplexF16), (zeros32, Float32), + (zerosC32, ComplexF32), (zeros64, Float64), (zerosC64, ComplexF64), + (ones16, Float16), (onesC16, ComplexF16), (ones32, Float32), + (onesC32, ComplexF32), (ones64, Float64), (onesC64, ComplexF64), + (rand16, Float16), (randC16, ComplexF16), (rand32, Float32), + (randC32, ComplexF32), (rand64, Float64), (randC64, ComplexF64), + (randn16, Float16), (randnC16, ComplexF16), (randn32, Float32), + (randnC32, ComplexF32), (randn64, Float64), (randnC64, ComplexF64)] + # Sizes + @test size(init(3)) == (3,) + @test size(init(rng, 3)) == (3,) + @test size(init(3, 4)) == (3, 4) + @test size(init(rng, 3, 4)) == (3, 4) + @test size(init(3, 4, 5)) == (3, 4, 5) + @test size(init(rng, 3, 4, 5)) == (3, 4, 5) + # Type + @test eltype(init(rng, 4, 2)) == fp + @test eltype(init(4, 2)) == fp + # RNG Closure + cl = init(rng) + @test cl(3) isa arrtype{fp, 1} + @test cl(3, 5) isa arrtype{fp, 2} + end + + @testset "AbstractArray Type: $init $T" for init in [ + kaiming_uniform, kaiming_normal, glorot_uniform, + glorot_normal, truncated_normal, identity_init], + T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) + + init === truncated_normal && !(T <: Real) && continue + + @test init(T, 3) isa AbstractArray{T, 1} + @test init(rng, T, 3) isa arrtype{T, 1} + @test init(T, 3, 5) isa AbstractArray{T, 2} + @test init(rng, T, 3, 5) isa arrtype{T, 2} + + cl = init(rng) + @test cl(T, 3) isa arrtype{T, 1} + @test cl(T, 3, 5) isa arrtype{T, 2} + + cl = init(rng, T) + @test cl(3) isa arrtype{T, 1} + @test cl(3, 5) isa arrtype{T, 2} + end + + @testset "Closure: $init" for init in [ + kaiming_uniform, kaiming_normal, glorot_uniform, + glorot_normal, truncated_normal, identity_init] + cl = init(;) + # Sizes + @test size(cl(3)) == (3,) + @test size(cl(rng, 3)) == (3,) + @test size(cl(3, 4)) == (3, 4) + @test size(cl(rng, 3, 4)) == (3, 4) + @test size(cl(3, 4, 5)) == (3, 4, 5) + @test size(cl(rng, 3, 4, 5)) == (3, 4, 5) + # Type + @test eltype(cl(4, 2)) == Float32 + @test eltype(cl(rng, 4, 2)) == Float32 + end + + @testset "Kwargs types" for T in ( + Float16, Float32, Float64, ComplexF16, ComplexF32, 
ComplexF64) + if (T <: Real) + @test eltype(truncated_normal(T, 2, 5; mean=0, std=1, lo=-2, hi=2)) == T + @test eltype(orthogonal(T, 2, 5; gain=1.0)) == T + end + @test eltype(glorot_uniform(T, 2, 5; gain=1.0)) == T + @test eltype(glorot_normal(T, 2, 5; gain=1.0)) == T + @test eltype(kaiming_uniform(T, 2, 5; gain=sqrt(2))) == T + @test eltype(kaiming_normal(T, 2, 5; gain=sqrt(2))) == T + @test eltype(identity_init(T, 2, 5; gain=1.0)) == T + @test eltype(sparse_init(T, 2, 5; sparsity=0.5, std=0.01)) == T + end + + @testset "kaiming" begin + # kaiming_uniform should yield a kernel in range [-sqrt(6/n_out), sqrt(6/n_out)] + # and kaiming_normal should yield a kernel with stddev ~= sqrt(2/n_out) + for (n_in, n_out) in [(100, 100), (100, 400)] + v = kaiming_uniform(rng, n_in, n_out) + σ2 = sqrt(6 / n_out) + @test -1σ2 < minimum(v) < -0.9σ2 + @test 0.9σ2 < maximum(v) < 1σ2 + + v = kaiming_normal(rng, n_in, n_out) + σ2 = sqrt(2 / n_out) + @test 0.9σ2 < std(v) < 1.1σ2 + end + # Type + @test eltype(kaiming_uniform(rng, 3, 4; gain=1.5f0)) == Float32 + @test eltype(kaiming_normal(rng, 3, 4; gain=1.5f0)) == Float32 + end + + @testset "glorot: $init" for init in [glorot_uniform, glorot_normal] + # glorot_uniform and glorot_normal should both yield a kernel with + # variance ≈ 2/(fan_in + fan_out) + for dims in [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + v = init(dims...) + fan_in, fan_out = WeightInitializers._nfan(dims...) + σ2 = 2 / (fan_in + fan_out) + @test 0.9σ2 < var(v) < 1.1σ2 + end + @test eltype(init(3, 4; gain=1.5)) == Float32 + end + + @testset "orthogonal" begin + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rows, cols) + rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + rows < cols ? 
(@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + @test eltype(orthogonal(3, 4; gain=1.5)) == Float32 + end + end +end diff --git a/test/qa_tests.jl b/test/qa_tests.jl new file mode 100644 index 0000000..c5c93c2 --- /dev/null +++ b/test/qa_tests.jl @@ -0,0 +1,23 @@ +@testitem "Aqua: Quality Assurance" begin + using Aqua + + Aqua.test_all(WeightInitializers; ambiguities=false) + Aqua.test_ambiguities(WeightInitializers; recursive=false) +end + +@testitem "Explicit Imports: Quality Assurance" setup=[SharedTestSetup] begin + using CUDA, ExplicitImports + + @test check_no_implicit_imports(WeightInitializers) === nothing + @test check_no_stale_explicit_imports(WeightInitializers) === nothing + @test check_no_self_qualified_accesses(WeightInitializers) === nothing +end + +@testitem "doctests: Quality Assurance" begin + using Documenter + + doctestexpr = :(using Random, WeightInitializers) + + DocMeta.setdocmeta!(WeightInitializers, :DocTestSetup, doctestexpr; recursive=true) + doctest(WeightInitializers; manual=false) +end diff --git a/test/runtests.jl b/test/runtests.jl index a620753..8ba7978 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,286 +1,3 @@ -using Aqua -using WeightInitializers, Test, Statistics -using StableRNGs, Random, CUDA, LinearAlgebra +using ReTestItems -CUDA.allowscalar(false) - -const GROUP = get(ENV, "GROUP", "All") - -@testset "WeightInitializers.jl Tests" begin - rngs_arrtypes = [] - - if GROUP == "All" || GROUP == "CPU" - append!(rngs_arrtypes, - [(StableRNG(12345), AbstractArray), (Random.default_rng(), AbstractArray)]) - end - - if GROUP == "All" || GROUP == "CUDA" - append!(rngs_arrtypes, [(CUDA.default_rng(), CuArray)]) - end - - @testset "_nfan" begin - # Fallback - @test WeightInitializers._nfan() == (1, 1) - # Vector - @test WeightInitializers._nfan(4) == (1, 4) - # Matrix - @test WeightInitializers._nfan(4, 5) == (5, 4) - # Tuple - @test WeightInitializers._nfan((4, 5, 6)) == WeightInitializers._nfan(4, 5, 6) - # Convolution - @test WeightInitializers._nfan(4, 5, 6) == 4 .* (5, 6) - end - - @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes - @testset "Sizes and Types: $init" for init in [ - zeros32, ones32, rand32, randn32, kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal, identity_init] - # Sizes - @test size(init(3)) == (3,) - @test size(init(rng, 3)) == (3,) - @test size(init(3, 4)) == (3, 4) - @test size(init(rng, 3, 4)) == (3, 4) - @test size(init(3, 4, 5)) == (3, 4, 5) - @test size(init(rng, 3, 4, 5)) == (3, 4, 5) - # Type - @test eltype(init(rng, 4, 2)) == Float32 - @test eltype(init(4, 2)) == Float32 - # RNG Closure - cl = init(rng) - @test cl(3) isa arrtype{Float32, 1} - @test cl(3, 5) isa arrtype{Float32, 2} - end - - @testset "Sizes and Types: $init" for (init, fp) in [ - (zeros16, Float16), (zerosC16, ComplexF16), (zeros32, Float32), - (zerosC32, ComplexF32), (zeros64, Float64), (zerosC64, ComplexF64), - (ones16, Float16), (onesC16, ComplexF16), (ones32, Float32), - (onesC32, ComplexF32), (ones64, Float64), (onesC64, ComplexF64), - (rand16, Float16), (randC16, ComplexF16), (rand32, Float32), - (randC32, ComplexF32), (rand64, Float64), (randC64, ComplexF64), - (randn16, Float16), (randnC16, ComplexF16), (randn32, Float32), - (randnC32, ComplexF32), (randn64, Float64), (randnC64, ComplexF64)] - # Sizes - @test size(init(3)) == (3,) - @test size(init(rng, 3)) == (3,) - @test size(init(3, 4)) == (3, 4) - @test size(init(rng, 3, 4)) == (3, 4) - @test size(init(3, 
4, 5)) == (3, 4, 5) - @test size(init(rng, 3, 4, 5)) == (3, 4, 5) - # Type - @test eltype(init(rng, 4, 2)) == fp - @test eltype(init(4, 2)) == fp - # RNG Closure - cl = init(rng) - @test cl(3) isa arrtype{fp, 1} - @test cl(3, 5) isa arrtype{fp, 2} - end - - @testset "AbstractArray Type: $init $T" for init in [ - kaiming_uniform, kaiming_normal, glorot_uniform, - glorot_normal, truncated_normal, identity_init], - T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) - - init === truncated_normal && !(T <: Real) && continue - - @test init(T, 3) isa AbstractArray{T, 1} - @test init(rng, T, 3) isa arrtype{T, 1} - @test init(T, 3, 5) isa AbstractArray{T, 2} - @test init(rng, T, 3, 5) isa arrtype{T, 2} - - cl = init(rng) - @test cl(T, 3) isa arrtype{T, 1} - @test cl(T, 3, 5) isa arrtype{T, 2} - - cl = init(rng, T) - @test cl(3) isa arrtype{T, 1} - @test cl(3, 5) isa arrtype{T, 2} - end - - @testset "Closure: $init" for init in [ - kaiming_uniform, kaiming_normal, glorot_uniform, - glorot_normal, truncated_normal, identity_init] - cl = init(;) - # Sizes - @test size(cl(3)) == (3,) - @test size(cl(rng, 3)) == (3,) - @test size(cl(3, 4)) == (3, 4) - @test size(cl(rng, 3, 4)) == (3, 4) - @test size(cl(3, 4, 5)) == (3, 4, 5) - @test size(cl(rng, 3, 4, 5)) == (3, 4, 5) - # Type - @test eltype(cl(4, 2)) == Float32 - @test eltype(cl(rng, 4, 2)) == Float32 - end - - @testset "Kwargs types" for T in ( - Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) - if (T <: Real) - @test eltype(truncated_normal(T, 2, 5; mean=0, std=1, lo=-2, hi=2)) == T - @test eltype(orthogonal(T, 2, 5; gain=1.0)) == T - end - @test eltype(glorot_uniform(T, 2, 5; gain=1.0)) == T - @test eltype(glorot_normal(T, 2, 5; gain=1.0)) == T - @test eltype(kaiming_uniform(T, 2, 5; gain=sqrt(2))) == T - @test eltype(kaiming_normal(T, 2, 5; gain=sqrt(2))) == T - @test eltype(identity_init(T, 2, 5; gain=1.0)) == T - @test eltype(sparse_init(T, 2, 5; sparsity=0.5, std=0.01)) == T - end - - @testset "kaiming" begin - # kaiming_uniform should yield a kernel in range [-sqrt(6/n_out), sqrt(6/n_out)] - # and kaiming_normal should yield a kernel with stddev ~= sqrt(2/n_out) - for (n_in, n_out) in [(100, 100), (100, 400)] - v = kaiming_uniform(rng, n_in, n_out) - σ2 = sqrt(6 / n_out) - @test -1σ2 < minimum(v) < -0.9σ2 - @test 0.9σ2 < maximum(v) < 1σ2 - - v = kaiming_normal(rng, n_in, n_out) - σ2 = sqrt(2 / n_out) - @test 0.9σ2 < std(v) < 1.1σ2 - end - # Type - @test eltype(kaiming_uniform(rng, 3, 4; gain=1.5f0)) == Float32 - @test eltype(kaiming_normal(rng, 3, 4; gain=1.5f0)) == Float32 - end - - @testset "glorot: $init" for init in [glorot_uniform, glorot_normal] - # glorot_uniform and glorot_normal should both yield a kernel with - # variance ≈ 2/(fan_in + fan_out) - for dims in [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] - v = init(dims...) - fan_in, fan_out = WeightInitializers._nfan(dims...) - σ2 = 2 / (fan_in + fan_out) - @test 0.9σ2 < var(v) < 1.1σ2 - end - @test eltype(init(3, 4; gain=1.5)) == Float32 - end - - @testset "orthogonal" begin - # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. - for (rows, cols) in [(5, 3), (3, 5)] - v = orthogonal(rows, cols) - rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) - end - for mat in [(3, 4, 5), (2, 2, 5)] - v = orthogonal(mat...) - cols = mat[end] - rows = div(prod(mat), cols) - v = reshape(v, (rows, cols)) - rows < cols ? 
(@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) - end - @test eltype(orthogonal(3, 4; gain=1.5)) == Float32 - end - end - - @testset "Orthogonal rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes - # A matrix of dim = (m,n) with m > n should produce a QR decomposition. - # In the other case, the transpose should be taken to compute the QR decomposition. - for (rows, cols) in [(5, 3), (3, 5)] - v = orthogonal(rng, rows, cols) - CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : - (@test v' * v ≈ I(cols)) - end - for mat in [(3, 4, 5), (2, 2, 5)] - v = orthogonal(rng, mat...) - cols = mat[end] - rows = div(prod(mat), cols) - v = reshape(v, (rows, cols)) - CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : - (@test v' * v ≈ I(cols)) - end - # Type - @testset "Orthogonal Types $T" for T in (Float32, Float64)#(Float16, Float32, Float64) - @test eltype(orthogonal(rng, T, 3, 4; gain=1.5)) == T - @test eltype(orthogonal(rng, T, 3, 4, 5; gain=1.5)) == T - end - @testset "Orthogonal AbstractArray Type $T" for T in (Float32, Float64)#(Float16, Float32, Float64) - @test orthogonal(T, 3, 5) isa AbstractArray{T, 2} - @test orthogonal(rng, T, 3, 5) isa arrtype{T, 2} - - cl = orthogonal(rng) - @test cl(T, 3, 5) isa arrtype{T, 2} - - cl = orthogonal(rng, T) - @test cl(3, 5) isa arrtype{T, 2} - end - @testset "Orthogonal Closure" begin - cl = orthogonal(;) - # Sizes - @test size(cl(3, 4)) == (3, 4) - @test size(cl(rng, 3, 4)) == (3, 4) - @test size(cl(3, 4, 5)) == (3, 4, 5) - @test size(cl(rng, 3, 4, 5)) == (3, 4, 5) - # Type - @test eltype(cl(4, 2)) == Float32 - @test eltype(cl(rng, 4, 2)) == Float32 - end - end - - @testset "sparse_init rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes - # sparse_init should yield an error for non 2-d dimensions - # sparse_init should yield no zero elements if sparsity < 0 - # sparse_init should yield all zero elements if sparsity > 1 - # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values - # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter - - @test_throws ArgumentError sparse_init(3, 4, 5, sparsity=0.1) - @test_throws ArgumentError sparse_init(3, sparsity=0.1) - v = sparse_init(100, 100; sparsity=-0.1) - @test sum(v .== 0) == 0 - v = sparse_init(100, 100; sparsity=1.1) - @test sum(v .== 0) == length(v) - - for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] - expected_zeros = ceil(Integer, n_in * sparsity) - v = sparse_init(n_in, n_out; sparsity=sparsity, std=σ) - @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) - @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ - end - - # Type - @testset "sparse_init Types $T" for T in (Float16, Float32, Float64) - @test eltype(sparse_init(rng, T, 3, 4; sparsity=0.5)) == T - end - @testset "sparse_init AbstractArray Type $T" for T in (Float16, Float32, Float64) - @test sparse_init(T, 3, 5; sparsity=0.5) isa AbstractArray{T, 2} - @test sparse_init(rng, T, 3, 5; sparsity=0.5) isa arrtype{T, 2} - - cl = sparse_init(rng; sparsity=0.5) - @test cl(T, 3, 5) isa arrtype{T, 2} - - cl = sparse_init(rng, T; sparsity=0.5) - @test cl(3, 5) isa arrtype{T, 2} - end - @testset "sparse_init Closure" begin - cl = sparse_init(; sparsity=0.5) - # Sizes - @test size(cl(3, 4)) == (3, 4) - @test size(cl(rng, 3, 4)) == (3, 4) - # Type - @test eltype(cl(4, 2)) == Float32 - @test eltype(cl(rng, 4, 2)) == Float32 - end - end - - @testset 
"identity_init" begin - @testset "Non-identity sizes" begin - @test identity_init(2, 3)[:, end] == zeros(Float32, 2) - @test identity_init(3, 2; shift=1)[1, :] == zeros(Float32, 2) - @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) - @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) - @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) - end - end - - @testset "Warning: truncated_normal" begin - @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so \ - the distribution of values may be inaccurate." truncated_normal(2; mean=-5.0f0) - end - - @testset "Aqua: Quality Assurance" begin - Aqua.test_all(WeightInitializers; ambiguities=false) - Aqua.test_ambiguities(WeightInitializers; recursive=false) - end -end +ReTestItems.runtests(@__DIR__) diff --git a/test/shared_testsetup.jl b/test/shared_testsetup.jl new file mode 100644 index 0000000..5b18e59 --- /dev/null +++ b/test/shared_testsetup.jl @@ -0,0 +1,20 @@ +@testsetup module SharedTestSetup + +using CUDA, Random, StableRNGs + +CUDA.allowscalar(false) + +const BACKEND_GROUP = lowercase(get(ENV, "BACKEND_GROUP", "All")) + +RNGS_ARRTYPES = [] +if BACKEND_GROUP == "all" || BACKEND_GROUP == "cpu" + append!(RNGS_ARRTYPES, + [(StableRNG(12345), AbstractArray), (Random.GLOBAL_RNG, AbstractArray)]) +end +if BACKEND_GROUP == "all" || BACKEND_GROUP == "cuda" + push!(RNGS_ARRTYPES, (CUDA.default_rng(), CuArray)) +end + +export StableRNG, RNGS_ARRTYPES + +end diff --git a/test/utils_tests.jl b/test/utils_tests.jl new file mode 100644 index 0000000..c6c2b62 --- /dev/null +++ b/test/utils_tests.jl @@ -0,0 +1,9 @@ +@testitem "_nfan" begin + using WeightInitializers: _nfan + + @test _nfan() == (1, 1) # Fallback + @test _nfan(4) == (1, 4) # Vector + @test _nfan(4, 5) == (5, 4) # Matrix + @test _nfan((4, 5, 6)) == _nfan(4, 5, 6) # Tuple + @test _nfan(4, 5, 6) == 4 .* (5, 6) # Convolution +end