forked from kdharris101/iss
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathScaledKMeans.m
63 lines (49 loc) · 2.06 KB
/
ScaledKMeans.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
function [k, v, s2] = ScaledKMeans(x, v0, ScoreThresh)
% [k, v] = ScaledKMeans(x, v0);
%
% does a clustering that minimizes the norm of x_i - g_i * v_{k_i}, where:
% x_i is the i'th row of input matrix x (nPoints by nDims)
% g_i is a gain (not explicitly computed or returned)
% v_k is the k'th row of output v containing cluster means (nClusters by nDims)
% k_i is the i'th entry of output k giving the cluster of each point (nPoints by 1)
% s2_k is the first eigenval of the outer product matrix for cluster k
% ScoreThresh: x_i will only contribute to v_k if dot product > ScoreThresh
%
% input v0 (nClusters by nDims) is the starting point. (Required.)
if nargin<3 || isempty(ScoreThresh)
%MAYBE SHOULD INCREASE THIS ITERITAVELY BELOW
ScoreThresh = 0; % only keep good matches
end
MinClusterSize = 10; % delete clusters with too few points
ConvergenceCriterion = 0; % if this many or less changed, terminate
% normalize starting points and original data
vNorm = bsxfun(@rdivide, v0, sqrt(sum(v0.^2,2)));
v = vNorm;
xNorm = bsxfun(@rdivide, x, sqrt(sum(x.^2,2)));
[nClusters, nDims] = size(v);
s2 = zeros(nClusters,1);
MaxIter = 100;
k = nan; % to make sure it doesn't finish on first iteration
for i=1:MaxIter
kOld = k;
score = xNorm*v'; % project each point onto each cluster. Use normalized so we can interpret score
[TopScore, k] = max(score,[],2); % find best cluster for each point
k(TopScore<ScoreThresh)=0; % unclusterable points
if sum(k~=kOld)<=ConvergenceCriterion % need a better criterion!
break;
end
% find top svd component for points assigned to each cluster
for c=1:nClusters
MyPoints = x(k==c,:); % don't use normalized, to avoid overweighting weak points
nMyPoints = length(MyPoints);
if nMyPoints<MinClusterSize
v(c,:) = 0;
continue;
end
[TopEvec, s2(c)] = eigs(double(MyPoints'*MyPoints)/nMyPoints, 1);
v(c,:) = TopEvec*sign(mean(TopEvec)); % make them positive
end
% figure(9046);
% hist(k,0:max(k));
% drawnow
end