Removed Cholesky layer

msainsburydale · Jan 5, 2024 · 82a457b · 82a457b
1 parent deaeefe
commit 82a457b
Show file tree

Hide file tree

Showing 8 changed files with 110 additions and 152 deletions.
diff --git a/docs/src/API/architectures.md b/docs/src/API/architectures.md
@@ -35,8 +35,6 @@ neural estimator provides valid parameters.
 ```@docs
 Compress
 
-CholeskyCovariance
-
 CovarianceMatrix
 
 CorrelationMatrix

diff --git a/docs/src/API/utility.md b/docs/src/API/utility.md
@@ -40,6 +40,8 @@ removedata
 stackarrays
 
 vectotril
+
+vectocholesky
 ```
 
 ## Other

diff --git a/src/Architectures.jl b/src/Architectures.jl
@@ -319,7 +319,7 @@ function (d::DeepSetExpert)(Z::V) where {V <: AbstractVector{A}} where {A <: Abs
 	d.ϕ(u)
 end
 
-# Multiple data sets with set-level covariates 
+# Multiple data sets with set-level covariates
 function (d::DeepSetExpert)(tup::Tup) where {Tup <: Tuple{V₁, V₂}} where {V₁ <: AbstractVector{A}, V₂ <: AbstractVector{B}} where {A, B <: AbstractVector{T}} where {T}
 	Z = tup[1]
 	x = tup[2]
@@ -455,22 +455,41 @@ function (l::SplitApply)(x::AbstractArray)
 end
 
 
-# ---- Cholesky, Covariance, and Correlation matrices ----
+# ---- Layers to construct Covariance and Correlation matrices ----
+
+# Based on the approach described [here](https://stats.stackexchange.com/a/404453).
+# The mappping of `v` to the Cholesky factor is a well behaved diffeomorphism,
+# so it can be inverted; however, this inversion is not implemented.
+"""
+	vectocholesky(v::Vector; correlation::Bool = false)
+Transforms a vector `v` ∈ ℝᵈ, where d is a triangular number, into a valid
+Cholesky factor for either a correlation matrix or a covariance matrix.
+"""
+function vectocholesky(v; correlation::Bool = false)
+	if correlation
+		L = vectotril(v; strict = true)
+		L = unitdiagonal(L)
+	else
+		L = vectotril(v)
+	end
+	x = rowwisenorm(L)
+	L = L ./ x
+	LowerTriangular(L)
+end
+rowwisenorm(A,dims=2) = sqrt.(sum(abs2,A; dims = dims))
+unitdiagonal(A) = A - Diagonal(A[diagind(A)]) + I
 
 @doc raw"""
 	CorrelationMatrix(d)
 Layer for constructing the parameters of an unconstrained `d`×`d` correlation matrix.
 
-The layer transforms a `Matrix` with `d`(`d`-1)÷2 rows into a `Matrix` with
+The layer transforms a `Matrix` with (`d`-1)`d`÷2 rows into a `Matrix` with
 the same dimension.
 
-Internally, the layers uses the algorithm
-described [here](https://mc-stan.org/docs/reference-manual/cholesky-factors-of-correlation-matrices-1.html#cholesky-factor-of-correlation-matrix-inverse-transform)
-and [here](https://mc-stan.org/docs/reference-manual/correlation-matrix-transform.html#correlation-matrix-transform.section)
-to construct a valid Cholesky factor 𝐋, and then extracts the strict lower
-triangle from the positive-definite correlation matrix 𝐑 = 𝐋𝐋'. The strict lower
-triangle is extracted and vectorised in line with Julia's column-major ordering.
-For example, when modelling the correlation matrix,
+Internally, the layer constructs a valid Cholesky factor 𝐋 and then extracts
+the strict lower triangle from the positive-definite correlation matrix 𝐑 = 𝐋𝐋'.
+The lower triangle is extracted and vectorised in line with Julia's column-major
+ordering. For example, when modelling the correlation matrix
 
 ```math
 \begin{bmatrix}
@@ -480,15 +499,14 @@ R₃₁ & R₃₂ & 1\\
 \end{bmatrix},
 ```
 
-the rows of the matrix returned by a `CorrelationMatrix` layer will
-be ordered as
+the rows of the matrix returned by a `CorrelationMatrix` layer are ordered as
 
 ```math
 R₂₁, R₃₁, R₃₂,
 ```
 
 which means that the output can easily be transformed into the implied
-correlation matrices using the strict variant of [`vectotril`](@ref) and `Symmetric`.
+correlation matrices using [`vectotril`](@ref) and `Symmetric`.
 
 # Examples
 ```
@@ -521,98 +539,16 @@ function CorrelationMatrix(d::Integer)
 	return CorrelationMatrix(d, idx)
 end
 function (l::CorrelationMatrix)(x)
+	ArrayType = containertype(x)
 	p, K = size(x)
-	L = [vectocorrelationcholesky(x[:, k]) for k ∈ 1:K]
-	R = broadcast(x -> x*permutedims(x), L) # note that I replaced x' with permutedims(x) because Transpose/Adjoints don't work well with Zygote
+	L = [vectocholesky(x[:, k], correlation = true) for k ∈ 1:K]
+	R = broadcast(x -> x*permutedims(x), L) # permutedims(x) rather than x' because Transpose/Adjoints don't work well with Zygote
 	θ = broadcast(x -> x[l.idx], R)
-	return hcat(θ...)
-end
-function vectocorrelationcholesky(v)
-	ArrayType = containertype(v)
-	v = cpu(v)
-	z = tanh.(vectotril(v; strict=true))
-	n = length(v)
-	d = (-1 + isqrt(1 + 8n)) ÷ 2 + 1
-
-	L = [ correlationcholeskyterm(i, j, z)  for i ∈ 1:d, j ∈ 1:d ]
-	return convert(ArrayType, L)
-end
-function correlationcholeskyterm(i, j, z)
-	T = eltype(z)
-	if i < j
-		zero(T)
-	elseif 1 == i == j
-		one(T)
-	elseif 1 == j < i
-		z[i, j]
-	elseif 1 < j == i
-		prod(sqrt.(one(T) .- z[i, 1:j-i].^2))
-	else
-		z[i, j] * prod(sqrt.(one(T) .- z[i, 1:j-i].^2))
-	end
-end
-
-
-
-@doc raw"""
-	CholeskyCovariance(d)
-Layer for constructing the parameters of the lower Cholesky factor associated
-with an unconstrained `d`×`d` covariance matrix.
-
-The layer transforms a `Matrix` with `d`(`d`+1)÷2 rows into a `Matrix` of the
-same dimension, but with `d` rows constrained to be positive (corresponding to
-the diagonal elements of the Cholesky factor) and the remaining rows
-unconstrained.
-
-The ordering of the transformed `Matrix` aligns with Julia's column-major
-ordering. For example, when modelling the Cholesky factor,
-
-```math
-\begin{bmatrix}
-L₁₁ &     &     \\
-L₂₁ & L₂₂ &     \\
-L₃₁ & L₃₂ & L₃₃ \\
-\end{bmatrix},
-```
-
-the rows of the matrix returned by a `CholeskyCovariance` layer will
-be ordered as
-
-```math
-L₁₁, L₂₁, L₃₁, L₂₂, L₃₂, L₃₃,
-```
-
-which means that the output can easily be transformed into the implied
-Cholesky factors using [`vectotril`](@ref).
-
-# Examples
-```
-using NeuralEstimators
-
-d = 4
-p = d*(d+1)÷2
-θ = randn(p, 50)
-l = CholeskyCovariance(d)
-θ = l(θ)                              # returns matrix (used for Flux networks)
-L = [vectotril(y) for y ∈ eachcol(θ)] # convert matrix to Cholesky factors
-```
-"""
-struct CholeskyCovariance{T <: Integer, G}
-  d::T
-  diag_idx::G
-end
-function CholeskyCovariance(d::Integer)
-	diag_idx = [1]
-	for i ∈ 1:(d-1)
-		push!(diag_idx, diag_idx[i] + d-i+1)
-	end
-	CholeskyCovariance(d, diag_idx)
-end
-function (l::CholeskyCovariance)(x)
-	p, K = size(x)
-	y = [i ∈ l.diag_idx ? exp.(x[i, :]) : x[i, :] for i ∈ 1:p]
-	permutedims(reshape(vcat(y...), K, p))
+	θ = hcat(θ...)
+	θ = convert(ArrayType, θ)
+	return θ
 end
+(l::CorrelationMatrix)(x::AbstractVector) = l(reshape(x, :, 1))
 
 @doc raw"""
     CovarianceMatrix(d)
@@ -621,11 +557,10 @@ Layer for constructing the parameters of an unconstrained `d`×`d` covariance ma
 The layer transforms a `Matrix` with `d`(`d`+1)÷2 rows into a `Matrix` of the
 same dimension.
 
-Internally, it uses a `CholeskyCovariance` layer to construct a
-valid Cholesky factor 𝐋, and then extracts the lower triangle from the
-positive-definite covariance matrix 𝚺 = 𝐋𝐋'. The lower triangle is extracted
-and vectorised in line with Julia's column-major ordering. For example, when
-modelling the covariance matrix,
+Internally, the layer constructs a valid Cholesky factor 𝐋 and then extracts
+the lower triangle from the positive-definite covariance matrix 𝚺 = 𝐋𝐋'. The
+lower triangle is extracted and vectorised in line with Julia's column-major
+ordering. For example, when modelling the covariance matrix
 
 ```math
 \begin{bmatrix}
@@ -662,28 +597,20 @@ l = CovarianceMatrix(d)
 struct CovarianceMatrix{T <: Integer, G}
   d::T
   idx::G
-  choleskyparameters::CholeskyCovariance
 end
 function CovarianceMatrix(d::Integer)
 	idx = tril(trues(d, d))
 	idx = findall(vec(idx)) # convert to scalar indices
-	return CovarianceMatrix(d, idx, CholeskyCovariance(d))
+	return CovarianceMatrix(d, idx)
 end
-
 function (l::CovarianceMatrix)(x)
-	L = _constructL(l.choleskyparameters, x)
-	Σ = broadcast(x -> x*permutedims(x), L) # note that I replaced x' with permutedims(x) because Transpose/Adjoints don't work well with Zygote
+	ArrayType = containertype(x)
+	p, K = size(x)
+	L = [vectocholesky(x[:, k], correlation = false) for k ∈ 1:K]
+	Σ = broadcast(x -> x*permutedims(x), L) # permutedims(x) rather than x' because Transpose/Adjoints don't work well with Zygote
 	θ = broadcast(x -> x[l.idx], Σ)
-	return hcat(θ...)
-end
-
-function _constructL(l::CholeskyCovariance, x::Array)
-	Lθ = l(x)
-	K = size(Lθ, 2)
-	L = [vectotril(collect(view(Lθ, :, i))) for i ∈ 1:K]
-	L
+	θ = hcat(θ...)
+	θ = convert(ArrayType, θ)
+	return θ
 end
-
-(l::CholeskyCovariance)(x::AbstractVector) = l(reshape(x, :, 1))
 (l::CovarianceMatrix)(x::AbstractVector) = l(reshape(x, :, 1))
-(l::CorrelationMatrix)(x::AbstractVector) = l(reshape(x, :, 1))
diff --git a/src/NeuralEstimators.jl b/src/NeuralEstimators.jl
@@ -35,9 +35,8 @@ include("loss.jl")
 export ParameterConfigurations, subsetparameters
 include("Parameters.jl")
 
-export DeepSet, DeepSetExpert, Compress, SplitApply
-export CholeskyCovariance, CovarianceMatrix, CorrelationMatrix
-export vectotril, vectotriu
+export DeepSet, DeepSetExpert, Compress, SplitApply, CovarianceMatrix, CorrelationMatrix
+export vectotril, vectotriu, vectocholesky
 include("Architectures.jl")
 
 export NeuralEstimator, PointEstimator, IntervalEstimator, PiecewiseEstimator, initialise_estimator
@@ -73,6 +72,8 @@ include("missingdata.jl")
 
 end
 
+#TODO wonder if "Constrain" would be a better term than "Compress". Make it an alias for backwards compatability.
+
 #TODO
 # - Add helper functions for censored data and write an example in the documentation.
 # -	Plotting from Julia (which can dispatch on Assessment objects).

diff --git a/src/densities.jl b/src/densities.jl
@@ -41,7 +41,7 @@ function gaussiandensity(y::V, L::LT; logdensity::Bool = true) where {V <: Abstr
 end
 
 function gaussiandensity(y::A, L::LT; logdensity::Bool = true) where {A <: AbstractArray{T, N}, LT <: LowerTriangular} where {T, N}
-	l = mapslices(y -> gaussiandensity(vec(y), L, logdensity = logdensity), y, dims = 1:(N-1))
+	l = mapslices(y -> gaussiandensity(vec(y), L; logdensity = logdensity), y, dims = 1:(N-1))
 	return logdensity ? sum(l) : prod(l)
 end
 
@@ -50,6 +50,7 @@ function gaussiandensity(y::A, Σ::M; args...) where {A <: AbstractArray{T, N},
 	gaussiandensity(y, L; args...)
 end
 
+#TODO Add generalised-hyperbolic density once neural EM paper is finished.
 
 # ---- Bivariate density function for Schlather's model ----
 

diff --git a/src/missingdata.jl b/src/missingdata.jl
@@ -206,7 +206,7 @@ function removedata(Z::A, n::Integer;
 					contiguous_pattern::Bool = false,
 					variable_proportion::Bool = false
 					) where {A <: AbstractArray{T, N}} where {T, N}
-
+	if isa(Z, Vector) Z = reshape(Z, :, 1) end
 	m = size(Z)[end]           # number of replicates
 	d = prod(size(Z)[1:end-1]) # dimension of each replicate  NB assumes a singleton channel dimension
 
@@ -258,17 +258,26 @@ function removedata(Z::A, n::Integer;
 
 	return removedata(Z, Iᵤ)
 end
+function removedata(Z::V, n::Integer; args...) where {V <: AbstractVector{T}} where {T}
+	removedata(reshape(Z, :, 1), n)[:]
+end
 
-function removedata(Z::A, p::F; prevent_complete_missing::Bool = true) where {A <: AbstractArray{T, N}} where {T, N, F <: AbstractFloat}
-	d = prod(size(Z)[1:end-1]) # dimension of each replicate  NB assumes a singleton channel dimension
+
+function removedata(Z::A, p::F; args...) where {A <: AbstractArray{T, N}} where {T, N, F <: AbstractFloat}
+	if isa(Z, Vector) Z = reshape(Z, :, 1) end
+	d = prod(size(Z)[1:end-1]) # dimension of each replicate  NB assumes singleton channel dimension
 	p = repeat([p], d)
-	return removedata(Z, p; prevent_complete_missing = prevent_complete_missing)
+	return removedata(Z, p; args...)
+end
+function removedata(Z::V, p::F; args...) where {V <: AbstractVector{T}} where {T, F <: AbstractFloat}
+	removedata(reshape(Z, :, 1), p)[:]
 end
 
-function removedata(Z::A, p::Vector{F}; prevent_complete_missing::Bool = true) where {A <: AbstractArray{T, N}} where {T, N, F <: AbstractFloat}
 
+function removedata(Z::A, p::Vector{F}; prevent_complete_missing::Bool = true) where {A <: AbstractArray{T, N}} where {T, N, F <: AbstractFloat}
+	if isa(Z, Vector) Z = reshape(Z, :, 1) end
 	m = size(Z)[end]           # number of replicates
-	d = prod(size(Z)[1:end-1]) # dimension of each replicate  NB assumes a singleton channel dimension
+	d = prod(size(Z)[1:end-1]) # dimension of each replicate  NB assumes singleton channel dimension
 	@assert length(p) == d "The length of `p` should equal the dimenison d of each replicate"
 	multivariatebernoulli = Product([Bernoulli(p[i]) for i ∈ eachindex(p)])
 
@@ -292,6 +301,9 @@ function removedata(Z::A, p::Vector{F}; prevent_complete_missing::Bool = true) w
 
 	return removedata(Z, Iᵤ)
 end
+function removedata(Z::V, p::Vector{F}; args...) where {V <: AbstractVector{T}} where {T, F <: AbstractFloat}
+	removedata(reshape(Z, :, 1), p)[:]
+end
 
 
 function removedata(Z::A, Iᵤ::V) where {A <: AbstractArray{T, N}, V <: AbstractVector{I}} where {T, N, I <: Integer}
@@ -306,6 +318,7 @@ function removedata(Z::A, Iᵤ::V) where {A <: AbstractArray{T, N}, V <: Abstrac
 end
 
 
+
 """
 	encodedata(Z::A; fixed_constant::T = zero(T)) where {A <: AbstractArray{Union{Missing, T}, N}} where T, N
 For data `Z` with missing entries, returns an augmented data set (U, W) where

diff --git a/src/simulate.jl b/src/simulate.jl
@@ -96,6 +96,9 @@ function simulategaussianprocess(obj::M, m::Integer) where M <: Union{AbstractMa
 	return y
 end
 
+# TODO This should really dispatch on LowerTriangular, Symmetric, and
+# UpperTriangular. For backwards compatability, we can keep the following
+# method for L::AbstractMatrix. 
 function simulategaussianprocess(L::M) where M <: AbstractMatrix{T} where T <: Number
 	L * randn(T, size(L, 1))
 end
@@ -105,6 +108,8 @@ function simulategaussianprocess(grf::GaussianRandomField)
 end
 
 
+# TODO add simulateGH()
+
 # ---- Schlather's max-stable model ----
 
 """