Finish rewriting the tests

LuxDL · Jun 27, 2024 · dc52b3e · dc52b3e
1 parent e74b058
commit dc52b3e
Show file tree

Hide file tree

Showing 15 changed files with 395 additions and 384 deletions.
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -16,7 +16,7 @@ steps:
           queue: "juliagpu"
           cuda: "*"
         env:
-          GROUP: "CUDA"
+          BACKEND_GROUP: "CUDA"
         if: build.message !~ /\[skip tests\]/
         timeout_in_minutes: 240
         matrix:
@@ -61,7 +61,7 @@ steps:
           queue: "juliagpu"
           cuda: "*"
         env:
-          GROUP: "CUDA"
+          BACKEND_GROUP: "CUDA"
           DOWNSTREAM_TEST_REPO: "{{matrix.repo}}"
         if: build.message !~ /\[skip tests\]/ || build.message !~ /\[skip downstream\]/
         timeout_in_minutes: 240
@@ -111,7 +111,7 @@ steps:
           rocm: "*"
           rocmgpu: "*"
         env:
-          GROUP: "AMDGPU"
+          BACKEND_GROUP: "AMDGPU"
           JULIA_AMDGPU_CORE_MUST_LOAD: "1"
           JULIA_AMDGPU_HIP_MUST_LOAD: "1"
           JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -37,7 +37,7 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         env:
-          GROUP: "CPU"
+          BACKEND_GROUP: "CPU"
           RETESTITEMS_NWORKERS: 4
           RETESTITEMS_NWORKER_THREADS: 2
       - uses: julia-actions/julia-processcoverage@v1

diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml
@@ -27,7 +27,7 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         env:
-          GROUP: "CPU"
+          BACKEND_GROUP: "CPU"
           RETESTITEMS_NWORKERS: 4
           RETESTITEMS_NWORKER_THREADS: 2
       - uses: julia-actions/julia-processcoverage@v1

diff --git a/.github/workflows/Downstream.yml b/.github/workflows/Downstream.yml
@@ -16,7 +16,7 @@ jobs:
     name: ${{ matrix.package.repo }}/${{ matrix.package.group }}
     runs-on: ${{ matrix.os }}
     env:
-      GROUP: ${{ matrix.package.group }}
+      BACKEND_GROUP: ${{ matrix.package.group }}
     strategy:
       fail-fast: false
       matrix:

diff --git a/.github/workflows/FormatCheck.yml b/.github/workflows/FormatCheck.yml
diff --git a/.github/workflows/QualityCheck.yml b/.github/workflows/QualityCheck.yml
@@ -0,0 +1,19 @@
+name: Code Quality Check
+
+on: [pull_request]
+
+jobs:
+  code-style:
+    name: Format Suggestions
+    runs-on: ubuntu-latest
+    steps:
+      - uses: julia-actions/julia-format@v3
+
+  typos-check:
+    name: Spell Check with Typos
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Actions Repository
+        uses: actions/checkout@v4
+      - name: Check spelling
+        uses: crate-ci/typos@v1.22.9
diff --git a/Project.toml b/Project.toml
@@ -4,7 +4,6 @@ authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
 version = "0.1.8"
 
 [deps]
-ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -21,7 +20,6 @@ WeightInitializersCUDAExt = "CUDA"
 
 [compat]
 Aqua = "0.8.7"
-ArgCheck = "2.3.0"
 CUDA = "5.3.2"
 ChainRulesCore = "1.23"
 Documenter = "1.5.0"

diff --git a/README.md b/README.md
@@ -8,7 +8,6 @@
 [![Build status](https://badge.buildkite.com/ffa2c8c3629cd58322446cddd3e8dcc4f121c28a574ee3e626.svg?branch=main)](https://buildkite.com/julialang/weightinitializers-dot-jl)
 [![CI](https://github.com/LuxDL/WeightInitializers.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/LuxDL/WeightInitializers.jl/actions/workflows/CI.yml)
 [![codecov](https://codecov.io/gh/LuxDL/WeightInitializers.jl/branch/main/graph/badge.svg?token=1ZY0A2NPEM)](https://codecov.io/gh/LuxDL/WeightInitializers.jl)
-[![Package Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/WeightInitializers)](https://pkgs.genieframework.com?packages=WeightInitializers)
 
 [![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor's%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
 [![SciML Code Style](https://img.shields.io/static/v1?label=code%20style&message=SciML&color=9558b2&labelColor=389826)](https://github.com/SciML/SciMLStyle)

diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl
@@ -1,8 +1,7 @@
 module WeightInitializersCUDAExt
 
 using CUDA: CUDA, CURAND
-using Random: Random, shuffle
-using WeightInitializers: WeightInitializers, NUM_TO_FPOINT, __partial_apply
+using WeightInitializers: WeightInitializers
 
 const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG}
 

diff --git a/src/initializers.jl b/src/initializers.jl
@@ -104,7 +104,8 @@ truncated normal distribution. The numbers are distributed like
 function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T(0),
         std=T(1), lo=-T(2), hi=T(2)) where {T <: Real}
     if (mean < lo - 2 * std) || (mean > hi + 2 * std)
-        @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate."
+        @warn "Mean is more than 2 std outside the limits in truncated_normal, so the \
+               distribution of values may be inaccurate."
     end
     l = _norm_cdf((T(lo) - T(mean)) / T(std))
     u = _norm_cdf((T(hi) - T(mean)) / T(std))
@@ -122,13 +123,12 @@ end
         gain = 1)  -> AbstractArray{T, length(dims)}
 
 Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a
-(semi) orthogonal matrix, as described in [^Saxe14]
+(semi) orthogonal matrix, as described in [1].
 
 The function constructs an orthogonal or semi-orthogonal matrix depending on the specified
-dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`.
-For more than two dimensions, it computes an orthogonal matrix of
-size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to
-the original dimensions.
+dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. For more
+than two dimensions, it computes an orthogonal matrix of size `prod(dims[1:(end - 1)])` by
+`dims[end]` before reshaping it to the original dimensions.
 
 Cannot construct a vector, i.e., `length(dims) == 1` is forbidden.
 
@@ -141,9 +141,8 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden.
 
 # References
 
-[^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of
-learning in deep linear neural networks",
-ICLR 2014, https://arxiv.org/abs/1312.6120
+[1] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in
+deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120
 """
 function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...;
         gain::Number=T(1.0)) where {T <: Number}
@@ -164,56 +163,52 @@ end
         sparsity::Number, std::Number=0.01) -> AbstractArray{T}
 
 Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements,
-using random numbers drawn from a normal distribution for the non-zero elements.
-This method is introduced in [^Martens2010].
-Note: The sparsity parameter controls the proportion of the matrix that will be zeroed.
-For example, a sparsity of 0.3 means that approximately 30% of the elements will be
-set to zero. The non-zero elements are distributed according to a normal distribution,
-scaled by the std parameter.
+using random numbers drawn from a normal distribution for the non-zero elements. This method
+was introduced in [1].
+
+!!! note
+
+    The sparsity parameter controls the proportion of the matrix that will be zeroed. For
+    example, a sparsity of 0.3 means that approximately 30% of the elements will be set to
+    zero. The non-zero elements are distributed according to a normal distribution, scaled
+    by the std parameter.
 
 # Arguments
 
   - `rng::AbstractRNG`: The random number generator to use.
   - `T::Type{<:Number}`: The numeric type of the elements in the returned array.
   - `dims::Integer...`: The dimensions of the weight matrix to be generated.
   - `sparsity::Number`: The proportion of elements to be zeroed. Must be between 0 and 1.
-  - `std::Number=0.01`: The standard deviation of the normal distribution
-    before applying `gain`.
+  - `std::Number=0.01`: The standard deviation of the normal distribution from which the
+    non-zero elements are drawn.
 
 # Returns
 
-  - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims`
-    and type `T`.
+  - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` and type
+    `T`.
 
 # Examples
 
-```julia
-using Random
+```jldoctest
+julia> y = sparse_init(Xoshiro(123), Float32, 5, 5; sparsity=0.3, std=0.01);
 
-# Initialize a 5x5 sparsely initialized matrix with 30% sparsity
-rng = MersenneTwister(123)
-matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01)
-```
+julia> y isa Matrix{Float32}
+true
 
-```
-5×5 Matrix{Float64}:
-  0.0          0.00273815    0.00592403   0.0          0.0
-  0.00459416  -0.000754831  -0.00888936  -0.0077507    0.0
-  0.0         -0.00194229    0.0          0.0         -0.00468489
-  0.0114265    0.0           0.0         -0.00734886   0.00277726
- -0.00396679   0.0           0.00327215  -0.0071741   -0.00880897
+julia> size(y) == (5, 5)
+true
 ```
 
 # References
 
-[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization"
-_Proceedings of the 27th International Conference on International Conference
-on Machine Learning_. 2010.
+[1] Martens, J. "Deep learning via Hessian-free optimization", Proceedings of the 27th
+International Conference on Machine Learning, 2010.
 """
 function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...;
         sparsity::Number, std::Number=T(0.01)) where {T <: Number}
     if length(dims) != 2
-        throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization."))
+        throw(ArgumentError("Only 2-dimensional outputs are supported for sparse \
+                             initialization."))
     end
 
     rows, cols = dims
@@ -250,8 +245,8 @@ most layers of a neural network. The identity mapping is scaled by the `gain` pa
   - Layers must have `input_size == output_size` for a perfect identity mapping.
     In cases where this condition is not met, the function pads extra dimensions with zeros.
   - For convolutional layers to achieve an identity mapping, kernel sizes must be odd,
-    and appropriate padding must be applied to ensure the output
-    feature maps are the same size as the input feature maps.
+    and appropriate padding must be applied to ensure the output feature maps are the same
+    size as the input feature maps.
 
 # Arguments
 
@@ -271,16 +266,21 @@ most layers of a neural network. The identity mapping is scaled by the `gain` pa
 
 # Examples
 
-```julia
-using Random
-
-# Identity matrix for fully connected layer
-identity_matrix = identity_init(MersenneTwister(123), Float32, 5, 5)
-
-# Identity tensor for convolutional layer
-identity_tensor = identity_init(MersenneTwister(123), Float32,        # Bias initialization
-    3, 3, 5,        # Matrix multiplication
-    5; gain=1.5, shift=(1, 0))
+```jldoctest
+julia> identity_init(Xoshiro(123), Float32, 5, 5)
+5×5 Matrix{Float32}:
+ 1.0  1.0  1.0  1.0  1.0
+ 1.0  1.0  1.0  1.0  1.0
+ 1.0  1.0  1.0  1.0  1.0
+ 1.0  1.0  1.0  1.0  1.0
+ 1.0  1.0  1.0  1.0  1.0
+
+julia> identity_init(Xoshiro(123), Float32, 3, 3, 1, 1; gain=1.5)
+3×3×1×1 Array{Float32, 4}:
+[:, :, 1, 1] =
+ 0.0  0.0  0.0
+ 0.0  1.5  0.0
+ 0.0  0.0  0.0
 ```
 """
 function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...;