📄 hierarchy.c

📁 数据挖掘经典的hierarchial clustering algorithm
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12

double Hierarchy::MinFtMerged(short ftype)
{
int MinI;
double tmpFt, MinFt=HUGE_DOUBLE;
for (int i=0; i<=chainptr; i++) {
        if (chain[i]<0) {
                tmpFt=cf[-chain[i]-1].Fitness(ftype);
                if (tmpFt<MinFt) {MinFt=tmpFt; MinI=i;}
                }
        }

// prepare for physically merging the MinI cluster
stopchain=FALSE;
chainptr=0;
chain[0]=chain[MinI];
return MinFt;
}

double Hierarchy::DistortionEdge(Entry *entries)
{
double TotalDistort=0;
for (int i=0; i<=chainptr; i++) {
        if (chain[i]<0)
                TotalDistort+=cf[-chain[i]-1].N()*cf[-chain[i]-1].Radius();
        else TotalDistort+=entries[chain[i]-1].N()*entries[chain[i]-1].Radius();
        }
return TotalDistort;
}

double Hierarchy::NmaxFtMerged(short ftype)
{
double NmaxFt;
int    maxn;
maxn = 0; NmaxFt=0;
for (int i=0; i<=chainptr; i++) {
        if (chain[i]<0 && cf[-chain[i]-1].N()>maxn) {
                maxn = cf[-chain[i]-1].N();
                NmaxFt = cf[-chain[i]-1].Fitness(ftype);
                }
        }
return NmaxFt;
}

double Hierarchy::TotalFtDEdge(short ftype, short d, Entry *entries)
{
double TotalFtD=0;
for (int i=0; i<=chainptr; i++) {
        if (chain[i]<0)
                TotalFtD+=pow(cf[-chain[i]-1].Fitness(ftype),1.0*d);
        else TotalFtD+=pow(entries[chain[i]-1].Fitness(ftype),1.0*d);
        }
return TotalFtD;
}

/***********************************************************
This is an O(n^3) algorithm, but works fine for all 5
(D0,D1,D2,D3,D4) distance definitions.
It is used to make sure Hierarchy1 and Hierarchy0 can
produce the same results for debugging purpose.
************************************************************/

void Hierarchy0(int &n,          // final number of clusters
                const int K,     // final number of clusters
                Entry **entries,
                short GDtype,
                short Ftype,
                double Ft)
{

if (n<=1) return;

int     i, j, imin, jmin, done;
short   *checked = new short[n];
memset(checked,0,n*sizeof(short));
        // 0: unchecked;
        // -1: exceeds the given threshold if merged with nearest neighbor;
        // -2: nonexistant after merging.

double  *dist = new double[n*(n-1)/2];
double  d, dmin;
Entry   tmpent;
tmpent.Init((*entries)[0].sx.dim);

dmin = HUGE;    // compute all initial distances and closest pair
for (i=0; i<n-1; i++)
  for (j=i+1; j<n; j++) {
        d = distance(GDtype,(*entries)[i],(*entries)[j]);
        dist[i*n-i*(i+1)/2+j-i-1] = d;
        if (d<dmin) {
                dmin = d;
                imin = i;
                jmin = j;
                }
        }

if (K==0) {// ****** case 1 ****** cluster by threshold ft
done = FALSE;
while (done==FALSE) {
        tmpent.Add((*entries)[imin],(*entries)[jmin]);
        if (tmpent.Fitness(Ftype) < Ft) {
        // within the threshold
                (*entries)[imin] += (*entries)[jmin];
                checked[jmin] = -2;
                for (i=0; i<imin; i++) {
                   if (checked[i]==0) {
                   dist[i*n-i*(i+1)/2+imin-i-1] =
                        distance(GDtype,(*entries)[i],(*entries)[imin]);
                   }}
                for (j=imin+1; j<n; j++) {
                   if (checked[j]==0) {
                   dist[imin*n-imin*(imin+1)/2+j-imin-1] =
                        distance(GDtype,(*entries)[imin],(*entries)[j]);
                   }}
                }
        else {
        // exceeds the threshold
                checked[imin] = -1;
                checked[jmin] = -1;
                }

        done = TRUE;
        dmin = HUGE;
        for (i=0; i<n-1; i++) {
          if (checked[i]==0) {
          for (j=i+1; j<n; j++) {
                if (checked[j]==0) {
                        d = dist[i*n-i*(i+1)/2+j-i-1];
                        if (d<dmin) {
                                done = FALSE;
                                dmin = d;
                                imin = i;
                                jmin = j;
        }}}}}
        } // end of while
} // end of if
else { // ***** case 2 ***** cluster by number k
done = n;
while (done > K) {
        (*entries)[imin] += (*entries)[jmin];
        checked[jmin] = -2;
        done--;
        for (i=0; i<imin; i++) {
           if (checked[i]==0) {
           dist[i*n-i*(i+1)/2+imin-i-1] =
                distance(GDtype,(*entries)[i],(*entries)[imin]);
           }}
        for (j=imin+1; j<n; j++) {
           if (checked[j]==0) {
           dist[imin*n-imin*(imin+1)/2+j-imin-1] =
                distance(GDtype,(*entries)[imin],(*entries)[j]);
           }}

        dmin = HUGE;
        for (i=0; i<n-1; i++) {
          if (checked[i]==0) {
          for (j=i+1; j<n; j++) {
                if (checked[j]==0) {
                d = dist[i*n-i*(i+1)/2+j-i-1];
                if (d<dmin) {
                        dmin = d;
                        imin = i;
                        jmin = j;
        }}}}}
        } // end of while
} // end of else

j = 0;
for (i=0; i<n; i++)
   if (checked[i]!=-2) {
        (*entries)[j]=(*entries)[i];
        j++;
        }
n=j;

delete [] checked;
delete [] dist;
}

/*********************************************************************
This is an O(n^2) algorithm, but works only for D2 and D4 due to the
"REDUCIBILITY PROPERTY" that is given as below:
if d(i,j) < p, d(i,k) > p, d(j,k) > p then d(i+j,k) > p.
Algorithm:
Part 1: Form the complete hierarchy in O(n^2) time
        step1: Pick any cluster i[1].
        step2: Determine the chain i[2]=NN(i[1]), i[3]=NN(i[2]) ...
               until i[k]=NN(i[k-1]) and i[k-1]=NN(i[k]), where NN
               means nearest neighbor.
        step3: Merge clusters i[k-1] and i[k].
        step4: If more than 1 clusters left, goto step using i[k-2]
               as start if (k-2>0), otherwise choose arbitrary i[1].
Part 2: Split from the complete hierarchy in O(n^2)
                by number of clusters K,
                by the threshold Ft,
                by tree height H,
                ...
*********************************************************************/

void Hierarchy1(int &n,         // final number of clusters
                const int K,    // final number of clusters
                Entry **entries,// array storing clusters
                short GDtype,
                short Ftype,
                double Ft)
{
if (GDtype!=D2 && GDtype!=D4)
        print_error("Hierarchy1","only support distance D2 and D4");

if (n<=1) return;
if (n==K) return;

// ****************** Part 1 **********************

Hierarchy *h = new Hierarchy(n-1,(*entries)[0].sx.dim);
h->MergeHierarchy(GDtype,n,*entries);

// ******************* Part 2 *********************

if (K==0) {     // by the threshold
        while (h->SplitHierarchy(BY_THRESHOLD,Ftype,Ft)==TRUE);
        }
else    {               // by the number of clusters
        while (h->chainptr+1<K &&
               h->SplitHierarchy(BY_KCLUSTERS,Ftype,Ft)==TRUE);
        }

// ******************* Part 3 *********************

int j;
Entry *tmpentries = new Entry[h->chainptr+1];
for (j=0;j<=h->chainptr;j++)
        tmpentries[j].Init((*entries)[0].sx.dim);

for (j=0;j<=h->chainptr;j++) {
        if (h->chain[j]<0)
                tmpentries[j] = h->cf[-h->chain[j]-1];
        else    tmpentries[j] = (*entries)[h->chain[j]-1];
        }

delete [] *entries;
*entries = tmpentries;
n=h->chainptr+1;
delete h;
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -