⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 clarans.c

📁 数据挖掘经典的hierarchical clustering algorithm
💻 C
字号:
/****************************************************************
File Name: clarans.C
Author: Tian Zhang, CS Dept., Univ. of Wisconsin-Madison, 1995

               Copyright(c) 1995 by Tian Zhang

                   All Rights Reserved

Permission to use, copy and modify this software must be granted
by the author and provided that the above copyright notice appear
in all relevant copies and that both that copyright notice and this
permission notice appear in all relevant supporting documentations.

Comments and additions may be sent the author at zhang@cs.wisc.edu.

******************************************************************/

#include <assert.h>
#include "global.h"
#include "util.h"
#include "vector.h"
#include "rectangle.h"
#include "cfentry.h"
#include "cutil.h"
#include "parameter.h"
#include "clarans.h"

/* for CLARANS0 use only */
/*
 * One randomized medoid-swap local search over plain vectors.
 *
 *   n        number of vectors in entries
 *   entries  the vectors; pairwise distance is whatever Vector's operator^
 *            (declared elsewhere) returns
 *   group    out: receives a new[]-allocated int[n] giving, for every vector,
 *            the index (0..k-1) of the medoid it is assigned to; the caller
 *            owns it and must delete [] it
 *   k        number of medoids
 *   neighbor number of consecutive non-improving swap trials after which the
 *            search stops
 *   passi    offset added to the evenly spaced starting medoid positions
 *
 * Returns the distance total: the initial sum of nearest-medoid distances
 * plus the (negative) cost change of every accepted swap.
 */
static double local_search0(int n, Vector *entries, int **group, int k, int neighbor,int passi)
{
    int c, p, slot, cand, rejected;
    int nearest_other;      /* closest medoid other than the one being replaced */
    double d, best, to_cand, to_old, to_own, delta;
    double total_dist = 0.0;

    int *cur_assign = new int[n];
    assert(cur_assign!=NULL);
    int *trial_assign = new int[n];
    assert(trial_assign!=NULL);

    short *is_medoid = new short[n];
    assert(is_medoid!=NULL);
    memset(is_medoid,0,n*sizeof(short));

    /* Deterministic start: medoids evenly spaced, shifted by passi. */
    int *medoid = new int[k];
    for (c=0; c<k; c++) {
        int at = n/k*c+passi;
        medoid[c] = at;
        is_medoid[at] = 1;
    }

    /* Initial assignment: every vector goes to its nearest medoid. */
    for (p=0; p<n; p++) {
        best = HUGE;
        for (c=0; c<k; c++) {
            d = entries[p]^entries[medoid[c]];
            if (d<best) {
                best = d;
                cur_assign[p] = c;
            }
        }
        total_dist += best;
    }

    /* Keep restarting the trial counter after every accepted swap; stop once
       `neighbor` trials in a row failed to improve the total. */
    do {
        rejected = 0;
        while (rejected<neighbor) {
            slot = rand() % k;          /* medoid slot to replace */
            cand = rand() % n;          /* candidate replacement */
            if (is_medoid[cand]) continue;

            memcpy(trial_assign,cur_assign,n*sizeof(int));

            /* Cost change of swapping medoid[slot] for cand. */
            delta = 0;
            for (p=0; p<n; p++) {
                to_cand = entries[p]^entries[cand];

                if (cur_assign[p]!=slot) {
                    /* Belongs to another medoid: moves only if cand is closer. */
                    to_own = entries[p]^entries[medoid[cur_assign[p]]];
                    if (to_cand < to_own) {
                        trial_assign[p] = slot;
                        delta += to_cand - to_own;
                    }
                    continue;
                }

                /* Belongs to the medoid that is being replaced. */
                to_old = entries[p]^entries[medoid[slot]];
                if (to_cand <= to_old) {
                    delta += to_cand-to_old;
                    continue;
                }

                /* cand is farther than the old medoid: see whether one of the
                   remaining medoids is a better home than cand. */
                best = HUGE;
                for (c=0; c<k; c++) {
                    if (c==slot) continue;
                    d = entries[p]^entries[medoid[c]];
                    if (d<best) {
                        best = d;
                        nearest_other = c;
                    }
                }
                if (best<to_cand) {
                    trial_assign[p] = nearest_other;
                    delta += best-to_old;
                }
                else {
                    delta += to_cand-to_old;
                }
            }

            if (delta >= 0.0) {
                rejected++;
                continue;
            }

            /* delta < 0 : improved -- commit the swap and start over. */
            is_medoid[medoid[slot]] = 0;
            is_medoid[cand] = 1;
            medoid[slot] = cand;
            int *swap_tmp = cur_assign;
            cur_assign = trial_assign;
            trial_assign = swap_tmp;
            total_dist += delta;
            break;
        }
    } while (rejected!=neighbor);

    delete [] is_medoid;
    delete [] medoid;
    delete [] trial_assign;
    *group = cur_assign;
    return(total_dist);
}

/* for local_search1 use only */
/* Assign the first k elements of ents2 to the first k elements of ents1,
   one Entry assignment per element, in ascending index order. */
static void entrycpy(Entry *ents1, Entry *ents2, int k)
{
Entry *dst = ents1;
Entry *src = ents2;
int remaining = k;

while (remaining > 0) {
        *dst = *src;
        dst++;
        src++;
        remaining--;
        }
}

/* for CLARANS1 use only */
/*
 * One randomized medoid-swap local search over CF entries, accepting a swap
 * only when it lowers the value returned by Quality().
 *
 *   n        number of entries
 *   entries  the input entries
 *   clusters out: receives a new[]-allocated Entry[k] holding the accepted
 *            clustering (each element is the sum, via Entry's += operator,
 *            of the entries assigned to it); the caller must delete [] it
 *   k        number of clusters / medoids
 *   dtype    selector passed through to distance()
 *   qtype    selector passed through to Quality()
 *   neighbor number of consecutive rejected swap trials after which the
 *            search stops
 *   passi    offset added to the evenly spaced starting medoid positions
 *
 * Returns the distance total: the initial sum of nearest-medoid distances
 * over the non-medoid entries plus the cost change of every accepted swap.
 *
 * Fixes relative to the previous version:
 *   - cih is reset to 0 for every trial (it was read uninitialised);
 *   - a rejected trial restores selected[repre[i]], the old medoid's data
 *     index, instead of selected[i], which is a cluster index;
 *   - the accepted state (oldclusters) is returned and the scratch copy
 *     (newclusters) is freed, not the other way round.
 */
static double local_search1(int n, Entry *entries, Entry **clusters, int k, short dtype, short qtype, int neighbor, int passi)
{
int i, j, g, h, min, no_test;
double dist, mindist, total_dist, qua, min_qua, djh, dji, cih;

int *repre = new int[k];
assert(repre!=NULL);

Entry *oldclusters = new Entry[k];
assert(oldclusters!=NULL);
for (i=0;i<k;i++)
        oldclusters[i].Init(entries[0].sx.dim);

Entry *newclusters = new Entry[k];
assert(newclusters!=NULL);
for (i=0;i<k;i++)
        newclusters[i].Init(entries[0].sx.dim);

Entry *tmpclusters;

short *selected = new short[n];
assert(selected!=NULL);
memset(selected,0,n*sizeof(short));

int *oldgroup = new int[n];
assert(oldgroup!=NULL);
int *newgroup = new int[n];
assert(newgroup!=NULL);
int *tmpgroup;

// select initial medoids : not randomly
for (i=0;i<k;i++) {
        j = i*n/k+passi;
        repre[i] = j;
        oldclusters[i] = entries[j];
        oldgroup[j] = i;
        selected[j] = 1;
        }

// assign initial groups and clusters
for (i=0,total_dist=0;i<n;i++) {
   if (selected[i]==1) continue;
   for (j=0, mindist = HUGE;j<k;j++) {
        dist = distance(dtype,entries[i],entries[repre[j]]);
        if (dist < mindist) {
                mindist = dist;
                oldgroup[i]=j;
                }
        }
   oldclusters[oldgroup[i]] += entries[i];
   total_dist += mindist;
   }

min_qua = Quality(qtype,k,oldclusters);

// random search
for (; ; ) {
for (no_test = 0; no_test < neighbor;) {
        i = rand() % k;         // medoid slot to replace
        h = rand() % n;         // candidate replacement
        if (selected[h]) continue;
        entrycpy(newclusters, oldclusters, k);
        memcpy(newgroup, oldgroup, n * sizeof(int));

        // for multiple distance definitions to work fine.
        // Tentatively swap: the old medoid becomes an ordinary point and
        // h becomes the medoid of cluster i.
        selected[repre[i]]=0;
        selected[h]=1;
        newgroup[h] = i;
        newclusters[i] += entries[h];
        newclusters[oldgroup[h]] -= entries[h];

        cih = 0.0;              // cost change of this trial starts from zero
        for (j=0; j<n; j++) {   // scan data set
         if (selected[j]==1) continue;
         djh = distance(dtype,entries[j],entries[h]);
         if (oldgroup[j]==i) {
                dji = distance(dtype,entries[j],entries[repre[i]]);
                if (djh <= dji) cih += djh-dji;
                else {
                   // h is farther than the old medoid: look for the closest
                   // of the remaining medoids. min starts at i so it always
                   // holds a valid cluster index.
                   for (g=0,mindist=HUGE,min=i; g<k; g++) {
                        if (g==i) continue;
                        dist = distance(dtype,entries[j],entries[repre[g]]);
                        if (dist < mindist) {
                                mindist = dist;
                                min = g;
                                }
                        }
                   if (mindist<djh) {
                                newgroup[j]=min;
                                cih += mindist-dji;
                                newclusters[i] -= entries[j];
                                newclusters[min] += entries[j];
                                }
                   else cih += djh-dji;
                   }
                }
        else { // (group[j]!=i)
                mindist = distance(dtype,entries[j],entries[repre[oldgroup[j]]]);
                // case1 djh >= djx : no operations
                // case2 djh < djx  : do operations
                if (djh < mindist) {
                   newgroup[j] = i;
                   cih += djh-mindist;
                   newclusters[oldgroup[j]] -= entries[j];
                   newclusters[i] += entries[j];
                   }
                }
        } // end for j

        qua = Quality(qtype, k, newclusters);
        if (qua >= min_qua) {
                // rejected: undo the tentative swap. repre[i] still holds
                // the old medoid's data index at this point.
                selected[repre[i]]=1;
                selected[h]=0;
                no_test++;
                }
        else {
                // accepted: the trial state becomes the current state
                min_qua=qua;
                total_dist += cih;
                repre[i]=h;
                tmpgroup = oldgroup;
                oldgroup = newgroup;
                newgroup = tmpgroup;
                tmpclusters = oldclusters;
                oldclusters = newclusters;
                newclusters = tmpclusters;
                break;
                }
      } // end for no_test

      if (no_test==neighbor) break;
   } // end of for (; ;)

delete [] repre;
delete [] selected;
delete [] oldgroup;
delete [] newgroup;
// oldclusters is the last accepted clustering; newclusters is only scratch
// space (possibly a rejected trial), so it is the one to free.
delete [] newclusters;
*clusters = oldclusters;
return(total_dist);
}

/*
 * Cluster the n entries into K clusters with CLARANS on per-entry vectors,
 * then overwrite entries[0..K-1] with the resulting clusters and set n = K.
 *
 *   n        in: number of entries; out: K
 *   K        number of clusters to produce
 *   entries  in: the entries to cluster; out: first K slots hold the clusters
 *
 * Each entry is reduced to the vector produced by Vector::Div(sx, n)
 * (presumably its centroid -- Div is declared elsewhere). local_search0 is
 * run no_local times with different starting offsets and the grouping with
 * the smallest returned distance total is kept.
 *
 * PERCENT, LOW_BOUND and HIGH_BOUND are defined outside this file; they
 * bound the number of neighbours examined per local search.
 *
 * Fix relative to the previous version: the group array returned by a local
 * search that does not win is now freed instead of leaked, and the winning
 * grouping is asserted to exist before it is used.
 */
void Clarans0(int &n, const int K, Entry *entries)
{
int *group=NULL, *min_group=NULL;
int i;
int no_local = 2;
int no_neighbor = K*(n-K);
double total_dist, min_total_dist;

Vector *centroids = new Vector[n];
for (i=0;i<n;i++)
        centroids[i].Init(entries[0].sx.dim);

if (PERCENT(no_neighbor)>LOW_BOUND && PERCENT(no_neighbor)<HIGH_BOUND)
        no_neighbor = (int) floor(PERCENT(no_neighbor));
else if (PERCENT(no_neighbor)>=HIGH_BOUND)
        no_neighbor = HIGH_BOUND;
     else if (no_neighbor>LOW_BOUND)
        no_neighbor = LOW_BOUND;

for (i=0; i<n; i++) centroids[i].Div(entries[i].sx,entries[i].n);

for (i=0, min_total_dist=HUGE*K*n; i<no_local; i++)
{       total_dist = local_search0(n,centroids,&group,K,no_neighbor,i);
        if (total_dist<min_total_dist) {
                min_total_dist = total_dist;
                delete [] min_group;    // delete [] on NULL is a no-op
                min_group = group;
                }
        else {
                // not better than the best so far: release it
                delete [] group;
                }
        group = NULL;
}

// At least one local search must have produced a usable grouping.
assert(min_group!=NULL);

Entry *clusters = new Entry[K];
for (i=0; i<K; i++)
        clusters[i].Init(entries[0].sx.dim);

// Accumulate every entry into the cluster its group index names.
for (i=0; i<n; i++)
        clusters[min_group[i]] += entries[i];
for (i=0; i<K; i++)
        entries[i]=clusters[i];
n = K;

delete [] min_group;
delete [] clusters;
delete [] centroids;
}

/*
 * Cluster the n entries into K clusters with CLARANS directly on the
 * entries, then overwrite entries[0..K-1] with the result and set n = K.
 *
 *   n        in: number of entries; out: K
 *   K        number of clusters to produce
 *   entries  in: the entries to cluster; out: first K slots hold the clusters
 *   GDtype   distance selector passed through to local_search1
 *   Qtype    quality selector passed to local_search1 and Quality()
 *
 * local_search1 is run no_local times with different starting offsets and
 * the clustering with the smallest Quality() value is kept.
 *
 * PERCENT, LOW_BOUND and HIGH_BOUND are defined outside this file; they
 * bound the number of neighbours examined per local search.
 *
 * Fix relative to the previous version: the cluster array returned by a
 * local search that does not win is now freed instead of leaked, the winning
 * clustering is asserted to exist before it is used, and the unused
 * distance-total locals and duplicate min_qua initialisation are removed.
 */
void Clarans1(int &n, const int K, Entry *entries, short GDtype, short Qtype)
{
int i;
int no_local = 2;
int no_neighbor = K * (n - K);
double min_qua,qua;
Entry *clusters=NULL, *min_clusters=NULL;

if (PERCENT(no_neighbor)>LOW_BOUND && PERCENT(no_neighbor)<HIGH_BOUND)
        no_neighbor = (int) floor(PERCENT(no_neighbor));
else if (PERCENT(no_neighbor)>=HIGH_BOUND)
        no_neighbor = HIGH_BOUND;
     else if (no_neighbor>LOW_BOUND)
        no_neighbor = LOW_BOUND;

for (i=0, min_qua=HUGE*n*K; i<no_local; i++) {
  // The returned distance total is not used here; selection is by quality.
  (void) local_search1(n,entries,&clusters,K,GDtype,Qtype,no_neighbor,i);
  qua = Quality(Qtype,K,clusters);
  if (qua < min_qua) {
        min_qua= qua;
        delete [] min_clusters;         // delete [] on NULL is a no-op
        min_clusters = clusters;
        }
  else {
        // not better than the best so far: release it
        delete [] clusters;
        }
  clusters = NULL;
  }

// At least one local search must have produced a usable clustering.
assert(min_clusters!=NULL);

for (i=0; i<K; i++)
        entries[i]=min_clusters[i];
delete [] min_clusters;
n = K;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -