-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkde.m
133 lines (119 loc) · 5.24 KB
/
kde.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
function [bandwidth,density,xmesh,cdf]=kde(data,n,MIN,MAX)
% Reliable and extremely fast kernel density estimator for one-dimensional data;
% Gaussian kernel is assumed and the bandwidth is chosen automatically;
% Unlike many other implementations, this one is immune to problems
% caused by multimodal densities with widely separated modes (see example). The
% estimation does not deteriorate for multimodal densities, because we never assume
% a parametric model for the data.
% INPUTS:
% data - a vector of data from which the density estimate is constructed;
% n - the number of mesh points used in the uniform discretization of the
% interval [MIN, MAX]; n has to be a power of two; if n is not a power of two, then
% n is rounded up to the next power of two, i.e., n is set to n=2^ceil(log2(n));
% the default value of n is n=2^12;
% MIN, MAX - defines the interval [MIN,MAX] on which the density estimate is constructed;
% the default values of MIN and MAX are:
% MIN=min(data)-Range/10 and MAX=max(data)+Range/10, where Range=max(data)-min(data);
% OUTPUTS:
% bandwidth - the optimal bandwidth (Gaussian kernel assumed);
% density - column vector of length 'n' with the values of the density
% estimate at the grid points;
% xmesh - the grid over which the density estimate is computed;
% - If no output is requested, then the code automatically plots a graph of
% the density estimate.
% cdf - column vector of length 'n' with the values of the cdf
% Reference:
% Kernel density estimation via diffusion
% Z. I. Botev, J. F. Grotowski, and D. P. Kroese (2010)
% Annals of Statistics, Volume 38, Number 5, pages 2916-2957.
%
% Example:
% data=[randn(100,1);randn(100,1)*2+35 ;randn(100,1)+55];
% kde(data,2^14,min(data)-5,max(data)+5);
% Notes: If you have a more reliable and accurate one-dimensional kernel density
% estimation software, please email me at [email protected]
data=data(:); %make data a column vector
if nargin<2 % if n is not supplied switch to the default
n=2^14;
end
n=2^ceil(log2(n)); % round up n to the next power of 2;
if nargin<4 %define the default interval [MIN,MAX]
minimum=min(data); maximum=max(data);
Range=maximum-minimum;
MIN=minimum-Range/10; MAX=maximum+Range/10;
end
% set up the grid over which the density estimate is computed;
R=MAX-MIN; dx=R/(n-1); xmesh=MIN+[0:dx:R]; N=length(unique(data));
%bin the data uniformly using the grid defined above;
initial_data=histc(data,xmesh)/N; initial_data=initial_data/sum(initial_data);
a=dct1d(initial_data); % discrete cosine transform of initial data
% now compute the optimal bandwidth^2 using the referenced method
I=[1:n-1]'.^2; a2=(a(2:end)/2).^2;
% use fzero to solve the equation t=zeta*gamma^[5](t)
try
t_star=fzero(@(t)fixed_point(t,N,I,a2),[0,.1]);
catch
t_star=.28*N^(-2/5);
end
% smooth the discrete cosine transform of initial data using t_star
a_t=a.*exp(-[0:n-1]'.^2*pi^2*t_star/2);
% now apply the inverse discrete cosine transform
if (nargout>1)|(nargout==0)
density=idct1d(a_t)/R;
end
% take the rescaling of the data into account
bandwidth=sqrt(t_star)*R;
if nargout==0
figure(1), plot(xmesh,density)
end
% for cdf estimation
if nargout>3
f=2*pi^2*sum(I.*a2.*exp(-I*pi^2*t_star));
t_cdf=(sqrt(pi)*f*N)^(-2/3);
% now get values of cdf on grid points using IDCT and cumsum function
a_cdf=a.*exp(-[0:n-1]'.^2*pi^2*t_cdf/2);
cdf=cumsum(idct1d(a_cdf))*(dx/R);
% take the rescaling into account if the bandwidth value is required
bandwidth_cdf=sqrt(t_cdf)*R;
end
end
%################################################################
function out=fixed_point(t,N,I,a2)
% this implements the function t-zeta*gamma^[l](t)
l=7;
f=2*pi^(2*l)*sum(I.^l.*a2.*exp(-I*pi^2*t));
for s=l-1:-1:2
K0=prod([1:2:2*s-1])/sqrt(2*pi); const=(1+(1/2)^(s+1/2))/3;
time=(2*const*K0/N/f)^(2/(3+2*s));
f=2*pi^(2*s)*sum(I.^s.*a2.*exp(-I*pi^2*time));
end
out=t-(2*N*sqrt(pi)*f)^(-2/5);
end
%##############################################################
function out = idct1d(data)
% computes the inverse discrete cosine transform
[nrows,ncols]=size(data);
% Compute weights
weights = nrows*exp(i*(0:nrows-1)*pi/(2*nrows)).';
% Compute x tilde using equation (5.93) in Jain
data = real(ifft(weights.*data));
% Re-order elements of each column according to equations (5.93) and
% (5.94) in Jain
out = zeros(nrows,1);
out(1:2:nrows) = data(1:nrows/2);
out(2:2:nrows) = data(nrows:-1:nrows/2+1);
% Reference:
% A. K. Jain, "Fundamentals of Digital Image
% Processing", pp. 150-153.
end
%##############################################################
function data=dct1d(data)
% computes the discrete cosine transform of the column vector data
[nrows,ncols]= size(data);
% Compute weights to multiply DFT coefficients
weight = [1;2*(exp(-i*(1:nrows-1)*pi/(2*nrows))).'];
% Re-order the elements of the columns of x
data = [ data(1:2:end,:); data(end:-2:2,:) ];
% Multiply FFT by weights:
data= real(weight.* fft(data));
end