⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 kmeans.m

📁 Matlab的kmeans算法实现
💻 M
📖 第 1 页 / 共 2 页
字号:

function converged = batchUpdate

% Every point moved, every cluster will need an update
moved = 1:n;
changed = 1:k;
previdx = zeros(n,1);
prevtotsumD = Inf;

if display > 2 % 'iter'
    disp(sprintf('  iter\t phase\t     num\t         sum'));
end

%
% Begin phase one:  batch reassignments
%

iter = 0;
converged = false;
while true
    iter = iter + 1;

    % Calculate the new cluster centroids and counts, and update the
    % distance from every point to those new cluster centroids
    [C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance);
    D(:,changed) = distfun(X, C(changed,:), distance, iter);

    % Deal with clusters that have just lost all their members
    empties = changed(m(changed) == 0);
    if ~isempty(empties)
        switch emptyact
        case 'error'
            error('stats:kmeans:EmptyCluster', ...
                  'Empty cluster created at iteration %d%s.',iter,repsMsg(rep,reps));
        case 'drop'
            % Remove the empty cluster from any further processing
            D(:,empties) = NaN;
            changed = changed(m(changed) > 0);
            warning('stats:kmeans:EmptyCluster', ...
                    'Empty cluster created at iteration %d%s.',iter,repsMsg(rep,reps));
        case 'singleton'
            warning('stats:kmeans:EmptyCluster', ...
                    'Empty cluster created at iteration %d%s.',iter,repsMsg(rep,reps));

            for i = empties
                d = D((idx-1)*n + (1:n)'); % use newly updated distances

                % Find the point furthest away from its current cluster.
                % Take that point out of its cluster and use it to create
                % a new singleton cluster to replace the empty one.
                [dlarge, lonely] = max(d);
                from = idx(lonely); % taking from this cluster
                if m(from) < 2
                    % In the very unusual event that the cluster had only
                    % one member, pick any other non-singleton point.
                    from = find(m>1,1,'first');
                    lonely = find(idx==from,1,'first');
                end
                C(i,:) = X(lonely,:);
                m(i) = 1;
                idx(lonely) = i;
                D(:,i) = distfun(X, C(i,:), distance, iter);

                % Update clusters from which points are taken
                [C(from,:), m(from)] = gcentroids(X, idx, from, distance);
                D(:,from) = distfun(X, C(from,:), distance, iter);
                changed = unique([changed from]);
            end
        end
    end

    % Compute the total sum of distances for the current configuration.
    totsumD = sum(D((idx-1)*n + (1:n)'));
    % Test for a cycle: if objective is not decreased, back out
    % the last step and move on to the single update phase
    if prevtotsumD <= totsumD
        idx = previdx;
        [C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance);
        iter = iter - 1;
        break;
    end
    if display > 2 % 'iter'
        disp(sprintf(dispfmt,iter,1,length(moved),totsumD));
    end
    if iter >= maxit
        break;
    end

    % Determine closest cluster for each point and reassign points to clusters
    previdx = idx;
    prevtotsumD = totsumD;
    [d, nidx] = min(D, [], 2);

    % Determine which points moved
    moved = find(nidx ~= previdx);
    if ~isempty(moved)
        % Resolve ties in favor of not moving
        moved = moved(D((previdx(moved)-1)*n + moved) > d(moved));
    end
    if isempty(moved)
        converged = true;
        break;
    end
    idx(moved) = nidx(moved);

    % Find clusters that gained or lost members
    changed = unique([idx(moved); previdx(moved)])';

end % phase one

end % nested function

%------------------------------------------------------------------

function converged = onlineUpdate

% Initialize some cluster information prior to phase two
switch distance
case 'cityblock'
    Xmid = zeros([k,p,2]);
    for i = 1:k
        if m(i) > 0
            % Separate out sorted coords for points in i'th cluster,
            % and save values above and below median, component-wise
            Xsorted = sort(X(idx==i,:),1);
            nn = floor(.5*m(i));
            if mod(m(i),2) == 0
                Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
            elseif m(i) > 1
                Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
            else
                Xmid(i,:,1:2) = Xsorted([1, 1],:)';
            end
        end
    end
case 'hamming'
    Xsum = zeros(k,p);
    for i = 1:k
        if m(i) > 0
            % Sum coords for points in i'th cluster, component-wise
            Xsum(i,:) = sum(X(idx==i,:), 1);
        end
    end
end

%
% Begin phase two:  single reassignments
%
changed = find(m' > 0);
lastmoved = 0;
nummoved = 0;
iter1 = iter;
converged = false;
while iter < maxit
    % Calculate distances to each cluster from each point, and the
    % potential change in total sum of errors for adding or removing
    % each point from each cluster.  Clusters that have not changed
    % membership need not be updated.
    %
    % Singleton clusters are a special case for the sum of dists
    % calculation.  Removing their only point is never best, so the
    % reassignment criterion had better guarantee that a singleton
    % point will stay in its own cluster.  Happily, we get
    % Del(i,idx(i)) == 0 automatically for them.
    switch distance
    case 'sqeuclidean'
        for i = changed
            mbrs = (idx == i);
            sgn = 1 - 2*mbrs; % -1 for members, 1 for nonmembers
            if m(i) == 1
                sgn(mbrs) = 0; % prevent divide-by-zero for singleton mbrs
            end
            Del(:,i) = (m(i) ./ (m(i) + sgn)) .* sum((X - C(repmat(i,n,1),:)).^2, 2);
        end
    case 'cityblock'
        for i = changed
            if mod(m(i),2) == 0 % this will never catch singleton clusters
                ldist = Xmid(repmat(i,n,1),:,1) - X;
                rdist = X - Xmid(repmat(i,n,1),:,2);
                mbrs = (idx == i);
                sgn = repmat(1-2*mbrs, 1, p); % -1 for members, 1 for nonmembers
                Del(:,i) = sum(max(0, max(sgn.*rdist, sgn.*ldist)), 2);
            else
                Del(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2);
            end
        end
    case {'cosine','correlation'}
        % The points are normalized, centroids are not, so normalize them
        normC = sqrt(sum(C.^2, 2));
        if any(normC < eps(class(normC))) % small relative to unit-length data points
            error('stats:kmeans:ZeroCentroid', ...
                  'Zero cluster centroid created at iteration %d%s.',iter,repsMsg(rep,reps));
        end
        % This can be done without a loop, but the loop saves memory allocations
        for i = changed
            XCi = X * C(i,:)';
            mbrs = (idx == i);
            sgn = 1 - 2*mbrs; % -1 for members, 1 for nonmembers
            Del(:,i) = 1 + sgn .*...
                  (m(i).*normC(i) - sqrt((m(i).*normC(i)).^2 + 2.*sgn.*m(i).*XCi + 1));
        end
    case 'hamming'
        for i = changed
            if mod(m(i),2) == 0 % this will never catch singleton clusters
                % coords with an unequal number of 0s and 1s have a
                % different contribution than coords with an equal
                % number
                unequal01 = find(2*Xsum(i,:) ~= m(i));
                numequal01 = p - length(unequal01);
                mbrs = (idx == i);
                Di = abs(X(:,unequal01) - C(repmat(i,n,1),unequal01));
                Del(:,i) = (sum(Di, 2) + mbrs*numequal01) / p;
            else
                Del(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2) / p;
            end
        end
    end

    % Determine best possible move, if any, for each point.  Next we
    % will pick one from those that actually did move.
    previdx = idx;
    prevtotsumD = totsumD;
    [minDel, nidx] = min(Del, [], 2);
    moved = find(previdx ~= nidx);
    if ~isempty(moved)
        % Resolve ties in favor of not moving
        moved = moved(Del((previdx(moved)-1)*n + moved) > minDel(moved));
    end
    if isempty(moved)
        % Count an iteration if phase 2 did nothing at all, or if we're
        % in the middle of a pass through all the points
        if (iter == iter1) || nummoved > 0
            iter = iter + 1;
            if display > 2 % 'iter'
                disp(sprintf(dispfmt,iter,2,nummoved,totsumD));
            end
        end
        converged = true;
        break;
    end

    % Pick the next move in cyclic order
    moved = mod(min(mod(moved - lastmoved - 1, n) + lastmoved), n) + 1;

    % If we've gone once through all the points, that's an iteration
    if moved <= lastmoved
        iter = iter + 1;
        if display > 2 % 'iter'
            disp(sprintf(dispfmt,iter,2,nummoved,totsumD));
        end
        if iter >= maxit, break; end
        nummoved = 0;
    end
    nummoved = nummoved + 1;
    lastmoved = moved;

    oidx = idx(moved);
    nidx = nidx(moved);
    totsumD = totsumD + Del(moved,nidx) - Del(moved,oidx);

    % Update the cluster index vector, and the old and new cluster
    % counts and centroids
    idx(moved) = nidx;
    m(nidx) = m(nidx) + 1;
    m(oidx) = m(oidx) - 1;
    switch distance
    case 'sqeuclidean'
        C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
        C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
    case 'cityblock'
        for i = [oidx nidx]
            % Separate out sorted coords for points in each cluster.
            % New centroid is the coord median, save values above and
            % below median.  All done component-wise.
            Xsorted = sort(X(idx==i,:),1);
            nn = floor(.5*m(i));
            if mod(m(i),2) == 0
                C(i,:) = .5 * (Xsorted(nn,:) + Xsorted(nn+1,:));
                Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
            else
                C(i,:) = Xsorted(nn+1,:);
                if m(i) > 1
                    Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
                else
                    Xmid(i,:,1:2) = Xsorted([1, 1],:)';
                end
            end
        end
    case {'cosine','correlation'}
        C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
        C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
    case 'hamming'
        % Update summed coords for points in each cluster.  New
        % centroid is the coord median.  All done component-wise.
        Xsum(nidx,:) = Xsum(nidx,:) + X(moved,:);
        Xsum(oidx,:) = Xsum(oidx,:) - X(moved,:);
        C(nidx,:) = .5*sign(2*Xsum(nidx,:) - m(nidx)) + .5;
        C(oidx,:) = .5*sign(2*Xsum(oidx,:) - m(oidx)) + .5;
    end
    changed = sort([oidx nidx]);
end % phase two

end % nested function

end % main function

%------------------------------------------------------------------

function D = distfun(X, C, dist, iter)
%DISTFUN Calculate point to cluster centroid distances.
[n,p] = size(X);
D = zeros(n,size(C,1));
nclusts = size(C,1);

switch dist
case 'sqeuclidean'
    for i = 1:nclusts
        D(:,i) = (X(:,1) - C(i,1)).^2;
        for j = 2:p
            D(:,i) = D(:,i) + (X(:,j) - C(i,j)).^2;
        end
        % D(:,i) = sum((X - C(repmat(i,n,1),:)).^2, 2);
    end
case 'cityblock'
    for i = 1:nclusts
        D(:,i) = abs(X(:,1) - C(i,1));
        for j = 2:p
            D(:,i) = D(:,i) + abs(X(:,j) - C(i,j));
        end
        % D(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2);
    end
case {'cosine','correlation'}
    % The points are normalized, centroids are not, so normalize them
    normC = sqrt(sum(C.^2, 2));
    if any(normC < eps(class(normC))) % small relative to unit-length data points
        error('stats:kmeans:ZeroCentroid', ...
              'Zero cluster centroid created at iteration %d.',iter);
    end
    
    for i = 1:nclusts
        D(:,i) = max(1 - X * (C(i,:)./normC(i))', 0);
    end
case 'hamming'
    for i = 1:nclusts
        D(:,i) = abs(X(:,1) - C(i,1));
        for j = 2:p
            D(:,i) = D(:,i) + abs(X(:,j) - C(i,j));
        end
        D(:,i) = D(:,i) / p;
        % D(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2) / p;
    end
end
end % function

%------------------------------------------------------------------

function [centroids, counts] = gcentroids(X, index, clusts, dist)
%GCENTROIDS Centroids and counts stratified by group.
[n,p] = size(X);
num = length(clusts);
centroids = NaN(num,p);
counts = zeros(num,1);

for i = 1:num
    members = (index == clusts(i));
    if any(members)
        counts(i) = sum(members);
        switch dist
        case 'sqeuclidean'
            centroids(i,:) = sum(X(members,:),1) / counts(i);
        case 'cityblock'
            % Separate out sorted coords for points in i'th cluster,
            % and use to compute a fast median, component-wise
            Xsorted = sort(X(members,:),1);
            nn = floor(.5*counts(i));
            if mod(counts(i),2) == 0
                centroids(i,:) = .5 * (Xsorted(nn,:) + Xsorted(nn+1,:));
            else
                centroids(i,:) = Xsorted(nn+1,:);
            end
        case {'cosine','correlation'}
            centroids(i,:) = sum(X(members,:),1) / counts(i); % unnormalized
        case 'hamming'
            % Compute a fast median for binary data, component-wise
            centroids(i,:) = .5*sign(2*sum(X(members,:), 1) - counts(i)) + .5;
        end
    end
end
end % function

%------------------------------------------------------------------

function s = repsMsg(rep,reps)
% Utility for warning and error messages.
if reps == 1
    s = '';
else
    s = sprintf(' during replicate %d',rep);
end
end % function

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -