Functionality to estimate Cholesky factors

msainsburydale · Jan 9, 2024 · 82f4212 · 82f4212
1 parent 0cad3ea
commit 82f4212
Show file tree

Hide file tree

Showing 6 changed files with 109 additions and 55 deletions.
diff --git a/docs/src/API/simulation.md b/docs/src/API/simulation.md
@@ -3,7 +3,9 @@
 
 ## Data simulators
 
-The philosophy of `NeuralEstimators` is to cater for arbitrary statistical models by having the user define their statistical model implicitly through simulated data. However, the following functions have been included as they may be helpful to others, and their source code provide an example for how a user could formulate code for their own model. If you've developed similar functions that you think may be helpful to others, please get in touch or make a pull request.
+The philosophy of `NeuralEstimators` is to cater for arbitrary statistical models by having the user define their statistical model implicitly through simulated data. However, the following functions have been included as they may be helpful to others, and their source code illustrates how a user could formulate code for their own model.
+
+See also [Distributions.jl](https://juliastats.org/Distributions.jl/stable/) for a large range of distributions implemented in Julia, and the package [RCall](https://juliainterop.github.io/RCall.jl/stable/) for calling R functions within Julia. 
 
 ```@docs
 simulategaussianprocess
@@ -30,7 +32,7 @@ maternchols
 
 ## Density functions
 
-Density functions are not needed in the workflow of `NeuralEstimators`. However, as part of a series of comparison studies between neural estimators and likelihood-based estimators given in the manuscript, we have developed the following density functions, and we include them in `NeuralEstimators` to cater for the possibility that they may be of use in future comparison studies.
+Density functions are not needed in the workflow of `NeuralEstimators`. However, as part of a series of comparison studies between neural estimators and likelihood-based estimators given in various paper, we have developed the following functions for evaluating the density function for several popular distributions. We include these in `NeuralEstimators` to cater for the possibility that they may be of use in future comparison studies.
 
 ```@docs
 gaussiandensity

diff --git a/docs/src/API/utility.md b/docs/src/API/utility.md
@@ -37,6 +37,8 @@ loadbestweights
 
 removedata
 
+rowwisenorm
+
 stackarrays
 
 vectotril

diff --git a/src/Architectures.jl b/src/Architectures.jl
@@ -418,9 +418,13 @@ triangularnumber(d) = d*(d+1)÷2
 
 @doc raw"""
     CovarianceMatrix(d)
-Layer for constructing the parameters of an unconstrained `d`×`d` covariance matrix.
+Layer that transforms a vector 𝐯 ∈ Rᵈ to the parameters of an unconstrained
+`d`×`d` covariance matrix, or the lower Cholesky factor of a `d`×`d` covariance
+matrix.
 
-The layer transforms a `Matrix` with `d`(`d`+1)÷2 rows into a `Matrix` of the
+The expected input is a `Matrix` with T(`d`) = `d`(`d`+1)÷2 rows, where T(`d`)
+is the `d`th triangular number (the number of free parameters in an
+unconstrained `d`×`d` covariance matrix), and the output is a `Matrix` of the
 same dimension.
 
 Internally, the layer constructs a valid Cholesky factor 𝐋 and then extracts
@@ -445,79 +449,92 @@ the rows of the matrix returned by a `CovarianceMatrix` are ordered as
 which means that the output can easily be transformed into the implied
 covariance matrices using [`vectotril`](@ref) and `Symmetric`.
 
+The Cholesky factor 𝐋 can be obtained directly by passing `true` when applying the layer (see below).
+
 # Examples
 ```
 using NeuralEstimators
+using Flux
 using LinearAlgebra
 
 d = 4
 l = CovarianceMatrix(d)
 p = d*(d+1)÷2
 θ = randn(p, 50)
 
-# returns a matrix of parameters
-θ = l(θ)
+# Returns a matrix of parameters, which can be converted to covariance matrices
+Σ = l(θ)
+Σ = [Symmetric(cpu(vectotril(x)), :L) for x ∈ eachcol(Σ)]
+Σ = [1]
 
-Σ = [Symmetric(cpu(vectotril(y)), :L) for y ∈ eachcol(θ)]
+# Obtain the Cholesky factor directly
+L = l(θ, true)
+L = [LowerTriangular(cpu(vectotril(x))) for x ∈ eachcol(L)]
+L[1]
+L[1] * L[1]'
 ```
 """
 struct CovarianceMatrix{T <: Integer, G, H}
-  d::T         # dimension of the matrix
-  p::T         # number of free parameters that in the covariance matrix, the triangular number T(d) = `d`(`d`+1)÷2
-  idx::G       # cartesian indices of lower triangle
-  diag_idx::H  # which of the T(d) rows correspond to the diagonal elements of the `d`×`d` covariance matrix
+  d::T          # dimension of the matrix
+  p::T          # number of free parameters that in the covariance matrix, the triangular number T(d) = `d`(`d`+1)÷2
+  tril_idx::G   # cartesian indices of lower triangle
+  diag_idx::H   # which of the T(d) rows correspond to the diagonal elements of the `d`×`d` covariance matrix (linear indices)
 end
 function CovarianceMatrix(d::Integer)
 	p = triangularnumber(d)
-	idx = tril(trues(d, d))
+	tril_idx = tril(trues(d, d))
 	diag_idx = [1]
 	for i ∈ 1:(d-1)
 		push!(diag_idx, diag_idx[i] + d-i+1)
 	end
-	return CovarianceMatrix(d, p, idx, diag_idx)
+	return CovarianceMatrix(d, p, tril_idx, diag_idx)
 end
-function (l::CovarianceMatrix)(v)
+function (l::CovarianceMatrix)(v, cholesky_only::Bool = false)
 
 	d = l.d
 	p, K = size(v)
 	@assert p == l.p "the number of rows must be the triangular number T(d) = d(d+1)÷2 = $(l.p)"
 
 	# Ensure that diagonal elements are positive
-	v = vcat([i ∈ l.diag_idx ? softplus.(v[i:i, :]) : v[i:i, :] for i ∈ 1:p]...)
+	L = vcat([i ∈ l.diag_idx ? softplus.(v[i:i, :]) : v[i:i, :] for i ∈ 1:p]...)
+	cholesky_only && return L
 
 	# Insert zeros so that the input v can be transformed into Cholesky factors
-	zero_mat = zero(v[1:d, :]) # NB Zygote does not like repeat()
+	zero_mat = zero(L[1:d, :]) # NB Zygote does not like repeat()
 	x = d:-1:1      # number of rows to extract from v
 	j = cumsum(x)   # end points of the v ranges
 	k = j .- x .+ 1 # start point of the v ranges
-	L = vcat(v[k[1]:j[1], :], [vcat(zero_mat[1:i.-1, :], v[k[i]:j[i], :]) for i ∈ 2:d]...)
+	L = vcat(L[k[1]:j[1], :], [vcat(zero_mat[1:i.-1, :], L[k[i]:j[i], :]) for i ∈ 2:d]...)
 
 	# Reshape to a three-dimensional array of Cholesky factors
 	L = reshape(L, d, d, K)
 
 	# Batched multiplication and transpose to compute covariance matrices
-	Σ = L ⊠ batched_transpose(L) # can alternatively use PermutedDimsArray(L, (2,1,3)) or permutedims(L, (2, 1, 3))
+	Σ = L ⊠ batched_transpose(L) # alternatively: PermutedDimsArray(L, (2,1,3)) or permutedims(L, (2, 1, 3))
 
 	# Extract the lower triangle of each matrix
-	θ = Σ[l.idx, :]
+	Σ = Σ[l.tril_idx, :]
 
-	return θ
+	return Σ
 end
-(l::CovarianceMatrix)(x::AbstractVector) = l(reshape(x, :, 1))
+(l::CovarianceMatrix)(v::AbstractVector) = l(reshape(v, :, 1))
 
 # Example input data helpful for prototyping:
-# d = 4
+# d = 3
 # K = 100
 # triangularnumber(d) = d*(d+1)÷2
-# p = triangularnumber(d)
+# p = triangularnumber(d-1)
 # v = collect(range(1, p*K))
 # v = reshape(v, p, K)
+# l = CorrelationMatrix(d)
 
 @doc raw"""
     CorrelationMatrix(d)
 Layer for constructing the parameters of an unconstrained `d`×`d` correlation matrix.
 
-The layer transforms a `Matrix` with (`d`-1)d`÷2 rows into a `Matrix` of the
+The expected input is a `Matrix` with T(`d`) = `d`(`d`+1)÷2 rows, where T(`d`)
+is the `d`th triangular number (the number of free parameters in an
+unconstrained `d`×`d` covariance matrix), and the output is a `Matrix` of the
 same dimension.
 
 Internally, the layer constructs a valid Cholesky factor 𝐋 for a correlation
@@ -542,40 +559,54 @@ R₂₁, R₃₁, R₃₂,
 which means that the output can easily be transformed into the implied
 correlation matrices using [`vectotril`](@ref) and `Symmetric`.
 
+The Cholesky factor 𝐋 can be obtained directly by passing `true` when applying the layer (see below).
+
 # Examples
 ```
 using NeuralEstimators
 using LinearAlgebra
+using Flux
 
-d = 4
-l = CorrelationMatrix(d)
-p = (d-1)*d÷2
-θ = randn(p, 5000)
-
-# returns a matrix of parameters
-θ = l(θ)
+d  = 4
+l  = CorrelationMatrix(d)
+p  = (d-1)*d÷2
+θ  = randn(p, 100)
 
-# convert matrix of parameters to implied correlation matrices
-R = map(eachcol(θ)) do y
-	R = Symmetric(cpu(vectotril(y, strict = true)), :L)
+# Returns a matrix of parameters, which can be converted to correlation matrices
+R = l(θ)
+R = map(eachcol(R)) do r
+	R = Symmetric(cpu(vectotril(r, strict = true)), :L)
 	R[diagind(R)] .= 1
 	R
 end
+R[1]
+
+# Obtain the Cholesky factor directly
+L = l(θ, true)
+L = map(eachcol(L)) do x
+	# Only the strict lower diagonal elements are returned
+	L = LowerTriangular(cpu(vectotril(x, strict = true)))
+
+	# Diagonal elements are determined under the constraint diag(L*L') = 𝟏
+	L[diagind(L)] .= sqrt.(1 .- rowwisenorm(L).^2)
+	L
+end
+L[1]
+L[1] * L[1]'
+R[1]
 ```
 """
 struct CorrelationMatrix{T <: Integer, G}
-  d::T         # dimension of the matrix
-  p::T         # number of free parameters that in the correlation matrix, the triangular number T(d-1) = (`d`-1)`d`÷2
-  idx::G       # cartesian indices of strict lower triangle
+  d::T                # dimension of the matrix
+  p::T                # number of free parameters that in the correlation matrix, the triangular number T(d-1) = (`d`-1)`d`÷2
+  tril_idx_strict::G  # cartesian indices of strict lower triangle
 end
 function CorrelationMatrix(d::Integer)
-	idx = tril(trues(d, d), -1)
+	tril_idx_strict = tril(trues(d, d), -1)
 	p = triangularnumber(d-1)
-	return CorrelationMatrix(d, p, idx)
+	return CorrelationMatrix(d, p, tril_idx_strict)
 end
-function (l::CorrelationMatrix)(v)
-
-	# TODO can I find a reference that this transformation does indeed span all correlation matrices?
+function (l::CorrelationMatrix)(v, cholesky_only::Bool = false)
 
 	d = l.d
 	p, K = size(v)
@@ -599,15 +630,14 @@ function (l::CorrelationMatrix)(v)
 	# Normalise the rows
 	L = L ./ rowwisenorm(L)
 
+	cholesky_only && return L[l.tril_idx_strict, :]
+
 	# Transpose and batched multiplication to compute correlation matrices
-	R = L ⊠ batched_transpose(L) # can alternatively use PermutedDimsArray(L, (2,1,3)) or permutedims(L, (2, 1, 3))
+	R = L ⊠ batched_transpose(L) # alternatively: PermutedDimsArray(L, (2,1,3)) or permutedims(L, (2, 1, 3))
 
 	# Extract the lower triangle of each matrix
-	θ = R[l.idx, :]
+	R = R[l.tril_idx_strict, :]
 
-  return θ
+  return R
 end
-(l::CorrelationMatrix)(x::AbstractVector) = l(reshape(x, :, 1))
-
-# TODO wonder if I should use the L1 norm, probably more numerically stable?
-rowwisenorm(A,dims=2) = sqrt.(sum(abs2,A; dims = dims))
+(l::CorrelationMatrix)(v::AbstractVector) = l(reshape(v, :, 1))
diff --git a/src/NeuralEstimators.jl b/src/NeuralEstimators.jl
@@ -64,7 +64,7 @@ include("plotting.jl")
 export bootstrap, interval
 include("bootstrap.jl")
 
-export stackarrays, expandgrid, loadbestweights, loadweights, numberreplicates, nparams, samplesize, drop, containertype, estimateinbatches
+export stackarrays, expandgrid, loadbestweights, loadweights, numberreplicates, nparams, samplesize, drop, containertype, estimateinbatches, rowwisenorm
 include("utility.jl")
 
 export NeuralEM, removedata, encodedata

diff --git a/src/utility.jl b/src/utility.jl
@@ -9,6 +9,11 @@ nested_eltype(x) = nested_eltype(typeof(x))
 nested_eltype(::Type{T}) where T <:AbstractArray = nested_eltype(eltype(T))
 nested_eltype(::Type{T}) where T = T
 
+"""
+	rowwisenorm(A)
+Computes the row-wise norm of a matrix `A`.
+"""
+rowwisenorm(A) = sqrt.(sum(abs2,A; dims = 2))
 
 # Original discussion: https://groups.google.com/g/julia-users/c/UARlZBCNlng
 vectotri_docs = """

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -378,7 +378,6 @@ end
 		testbackprop(l, dvc, p, K, 20)
 	end
 
-
 	@testset "CovarianceMatrix" begin
 
 		d = 4
@@ -392,11 +391,16 @@ end
 		@test size(θ̂) == (p, K)
 		@test length(l(θ[:, 1])) == p
 		@test typeof(θ̂) == typeof(θ)
-		testbackprop(l, dvc, p, K, d)
 
-		Σ = [Symmetric(cpu(vectotril(y)), :L) for y ∈ eachcol(θ̂)]
+		Σ = [Symmetric(cpu(vectotril(x)), :L) for x ∈ eachcol(θ̂)]
 		Σ = convert.(Matrix, Σ);
 		@test all(isposdef.(Σ))
+
+		L = l(θ, true)
+		L = [LowerTriangular(cpu(vectotril(x))) for x ∈ eachcol(L)]
+		@test all(Σ .≈ L .* permutedims.(L))
+
+		testbackprop(l, dvc, p, K, d)
 	end
 
 	A = rand(5,4)
@@ -415,13 +419,24 @@ end
 		@test typeof(θ̂) == typeof(θ)
 		@test all(-1 .<= θ̂ .<= 1)
 
-		R = map(eachcol(l(θ))) do y
-			R = Symmetric(cpu(vectotril(y; strict=true)), :L)
+		R = map(eachcol(l(θ))) do x
+			R = Symmetric(cpu(vectotril(x; strict=true)), :L)
 			R[diagind(R)] .= 1
 			R
 		end
 		@test all(isposdef.(R))
 
+		L = l(θ, true)
+		L = map(eachcol(L)) do x
+			# Only the strict lower diagonal elements are returned
+			L = LowerTriangular(cpu(vectotril(x, strict = true)))
+
+			# Diagonal elements are determined under the constraint diag(L*L') = 𝟏
+			L[diagind(L)] .= sqrt.(1 .- rowwisenorm(L).^2)
+			L
+		end
+		@test all(R .≈ L .* permutedims.(L))
+
 		testbackprop(l, dvc, p, K, d)
 	end
 end