📄 mainclustervalidationnc.m

📁 cluster validation tools matlab toolbox
💻 M
字号:
% Cluster-Validation for Estimating the Number of Clusters (Version 2.0)
% based on the clustering results of PAM or K-means clustering algorithm
% Please read the help file "Readme.txt" before running this program
% Kaijun WANG: sunice9@yahoo.com, April 2007.

% User-adjustable settings for the whole run.
% NOTE(review): newp and Hsep are not read anywhere in this file, and subp/pc
% appear only in the commented-out plotting calls at the end; presumably they
% are consumed by the scripts invoked later -- TODO confirm.
clear;
alg = 1;                     % clustering algorithm used in Part 2: 1 --- PAM, 2 --- K-means
newp=1;                   % opening a new figure window ?
subp=0;                    % plotting in which sub-window
pcolor=1;                  % color plotting ? (passed to plotindice at the end)
pc=1;                        % plotting data by Principal Component Analysis
staz=0;                     % standardization to [0 1] when mixed metric (Euclidean branch only)
Hsep=0;                   % threshold for System Evolution method
nk=1;                        % default when the true number of clusters (NC) is unknown;
                                  % overwritten in Part 1 for data sets with a known NC
N2=10;                     % searching limit is max(N2,nk+6)
type = 1;                   % 1 - using Euclidean distances for general data;
                                  % 2 - Pearson correlation coefficients for gene data
                                  % type is forced to 2 in the initialization section
                                  % below whenever id > 20

% Part 1: selecting a data set, data file: rows - data points, columns - dimensions
% Sets sw (name of the data file to load) and, for data sets whose true
% number of clusters is known, nk. For the "yourdata" cases nk keeps the
% value assigned in the settings section.
id = 31;

switch id
%(1) general simulated data, Euclidean distances, true labels in 1st column
case 1
     sw='4k2bigsmall_far.txt';               nk=4; % true number of clusters
case 2 
     sw='4k2bigsmall_lap.txt';               nk=4; 
case 3
     sw='8k2close.txt';                           nk=8; 
case 4 
     sw='8k2lap.txt';                               nk=8; 

% true labels being unknown (1st column is data too)
case 11
     sw='yourdata.txt'; 
     
%(2) simulated gene data, Pearson distances, true labels in 1st column
case 21 
     sw='6k20_close.txt';                      nk=6; 
case 22
     sw='6k40_far.txt';                           nk=6; 
case 23 
     sw='4k20_lap.txt';                           nk=4;
case 24 
     sw='4k40_lap.txt';                           nk=4; 
     
%(3) real gene data, Pearson distances, true labels in 1st column
case 31
     sw='leuk72_3k.txt';                         nk=3; 
case 32
     sw='lym96_4k.txt';                          nk=4; 
case 33
     sw='g205_4k.txt';                            nk=4; 
case 34
     sw='y208_4k.txt';                            nk=4; 
     
% real gene data, true labels being unknown (1st column is data too)
case 41
     sw='yourdata.txt'; 

otherwise
     % Without this branch an unsupported id leaves sw undefined and the
     % script only fails later, at load(sw), with an unrelated message.
     error('mainclustervalidationnc:unknownDataSet', ...
           'Unknown data set id %d: valid ids are 1-4, 11, 21-24, 31-34 and 41.', id);
end

% initialization: load the selected file and set up the search range
if id > 20, type = 2; end      % ids above 20 are the gene data sets (Pearson)
data = load(sw);
[nrow, dim] = size(data);
N1 = 2;                        % the forward search begins at k = 2 ...
N = max(N2, nk+6);             % ... and ends at k = N
if (id > 20 && id < 40) || id < 11
   % these data sets carry the class labels in their first column:
   % split it off so that data holds the features only
   truelabels = data(:,1);
   data(:,1) = [];
   dim = size(data,2);
else
   % labels unknown: every point gets the placeholder label 1
   truelabels = ones(nrow,1);
end

% calculating dis-similarity/distance matrix of a data set
% Produces Dist (pairwise distance matrix), dmax, dc (flag telling later code
% which energy routine to use) and, for PAM, the dissimilarity vector dissim.
if type == 2
  % Pearson similarity [-1,1] is normalized to Pearson distance [0,1]
  Dist = 1-(1+similarity_pearson(data'))/2; 
  for j = 1:nrow
     Dist(j,j) = 0;            % force an exact zero self-distance
  end
  dc = 2;                         % a sign to call energy_pearson.m
  dmax = 1;                    % max value of data
else
   if staz 
       data = standarz(data);            % columns are standardized within [0,1]
   end
   [Dist,dmax]= similarity_euclid(data); % Euclidean distances between rows
   dc = 1;                       % a sign to call energy_euclid.m
end
dissim = [];
if  alg == 1
  % Dissimilarity vector for PAM: the strict lower triangle of Dist read row
  % by row, i.e. Dist(2,1), Dist(3,1), Dist(3,2), ...  Flattening Dist.' walks
  % Dist row by row, and the column-wise strict upper-triangle mask then picks
  % exactly those entries in that order. This replaces growing dissim by
  % concatenation inside a loop, which reallocated the vector on every pass.
  % Assumes Dist is nrow-by-nrow.
  dissim = reshape(Dist.', 1, []);
  dissim = dissim(reshape(triu(true(nrow), 1), 1, []));
end

% Part 2: Running PAM or K-means clustering algorithm
% For every k in N1..N the data are clustered into k groups and the labels
% are stored in column k of classlabel (column 1 keeps the all-ones default).
fprintf('\n  ==> Estimating the Number of Clusters for example  %d', id);
classlabel = ones(nrow,N);
vtype = 4*ones(1,dim);         % only used by the alternative pam(data, i, vtype) call
% The K-means metric follows "type", the same switch that selected the
% distance matrix for PAM, so both algorithms always use the same notion of
% distance (previously this was derived from id, which ignored a manually
% set type = 2).
if type == 2
   Rd = 'correlation';
else
   Rd = 'euclidean';
end

for i = N1 : N
  if  alg == 1
    fprintf('\n  = = running on PAM clustering at k= %d', i);
    Scluster = pam(dissim', i);  %Scluster = pam(data, i, vtype);
    classlabel(:,i) = double(Scluster.ncluv)';
  else
    fprintf('\n  = = running on K-means clustering at k= %d', i);
    classlabel(:,i) = kmeans(data, i, 'distance', Rd);
  end
  Q = ind2cluster(classlabel(:,i)); 
  % less than 4 data points in one cluster, stop !
  ns = reshape(cellfun(@numel, Q), 1, []);   % size of every cluster
  if min(ns) < 4
     % the last usable k is the previous one
     % NOTE(review): if this triggers at i == N1, N drops below N1 -- check
     % that the scripts invoked afterwards cope with an empty search range.
     N = i-1;
     break;
  end
end

% Part 3:  Estimating the number of clusters by validity indices
% validity_Index is invoked without arguments or outputs; presumably a script
% that works on the variables prepared above -- TODO confirm.
validity_Index

% Part 4: System evolution method gives Number of Clusters
% NOTE(review): km, ters, g1 and g2 are not defined anywhere in this file, so
% they are expected to be created by the scripts invoked in Part 3/Part 4 --
% verify before reordering anything here.
SystemEvolution_energy
SystemEvolution_findk
km=km(1);
plotindice(ters(:,1:g2),g1,g2,km,1,0,pcolor);
% Optional data plots (disabled): by the labels found at k = km, or by the
% true labels.
% plotdata_bylabels(data,classlabel(:,km(1)),pc,subp,'nb');
% clf; plotdata_bylabels(data,truelabels,2,0,'nb');
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -