scalablekmeans.cpp
/* Scalable K-means clustering software
   Copyright (C) 2000 Fredrik Farnstrom and James Lewis

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   See the file README.TXT for more information. */

/* scalablekmeans.cpp */
/* Buffer-based phases of scalable k-means: model setup and initialization,
   filling the RAM buffer from the database, and primary/secondary
   compression of the retained point set. */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "singleton.h"
#include "subcluster.h"
#include "rambuffer.h"
#include "scalablekmeans.h"
#include "database.h"
#include "syntheticdata.h"

#define OUTPUTFILES 0
#define EARLYSTOPPING 1

/* qsort() comparison function: orders ClusterNodes by increasing distance. */
int ClusterNode::compareDistance(const void *a, const void *b)
{
    ClusterNode *aa = (ClusterNode*)a, *bb = (ClusterNode*)b;

    if(aa->distance > bb->distance)
        return 1;
    else if(aa->distance == bb->distance)
        return 0;
    else
        return -1;
}

/* Allocate one discard cluster, one main cluster and one "old" subcluster per
   requested cluster, and link them into the mainClusters/oldClusters lists. */
void ScalableKMeans::setup(void)
{
    int i;
    Subcluster *c;
    MainCluster *m;

    mainClusters = 0;
    oldClusters = 0;
    for(i = 0; i < numClusters; i++)
    {
        c = buffer.newDiscardCluster();
        m = new MainCluster(buffer.getDimensions());
        if(mainClusters)
            mainClusters->setPrevious(m);
        m->setNext(mainClusters);
        m->id = i;
        mainClusters = m;
        c->associate(m);
        m->associate(c);

        c = new Subcluster(buffer.getDimensions());
        if(oldClusters)
            oldClusters->setPrevious(c);
        c->setNext(oldClusters);
        oldClusters = c;
    }
}

/* Seed each main cluster with a randomly chosen point from the database. */
void ScalableKMeans::initModel(void)
{
    Subcluster *c, *c2;
//  Singleton *p = buffer.getRetainedSet();
    Singleton p(buffer.getDimensions());
    int k = 0, i, j;
    float *x;

    database->reset();
    c2 = oldClusters;
    for(c = mainClusters; c; c = c->getNext())
    {
        j = (rand() >> 3) % 80;
        for(i = 0; i <= j; i++)
            x = database->getPoint();
        p.set(x);
        c->clear();
        c->addPoint(&p);
        c->updateMean();
//      printf("%d ", k++); c->print();
//      p = p->getNext();
        c2->clear();
        c2->addCluster(c);
        c2->updateMean();
    }
}

/* Fill the RAM buffer with points from the database. */
int ScalableKMeans::fillBuffer(void)
{
    long k = 0;

    while(1)
    {
        // Call newSingleton() to see if new point fits
        Singleton *p = buffer.newSingleton();
        if(!p)
        {
            printf("%ld points added to buffer.\n", k);
            if(!k)
                return 0;
            return 1;
        }
        // If it fits,
        // get point from database
        float *x = database->getPoint();
        if(!x)
        {
            buffer.removeFromRetainedSet(p);
            delete p;
            break;
        }
        k++;
        // Call set() on the new point.
        p->set(x);
        // Repeat until buffer full or no more points in database.
//      p->print(stdout);
    }
    // Return 0 if database empty, 1 if buffer full
    return 0;
}

/* Primary compression: fold retained points into their cluster's discard set,
   either because they lie within the cluster's Mahalanobis discard radius
   (primary1) or because they stay assigned to the same cluster after the
   cluster means are perturbed (primary2). */
void ScalableKMeans::primaryCompression(void)
{
    Singleton *p, *np;
    Subcluster *c, *q, *mc, *nc;
    float d, md;
    long pointcounter[numClusters];
    int i, k, m;
    ClusterNode *list;
    long numdiscard;
    float discardradius;
    float stddev = confidenceStdDev;
    long discarded = 0;

    if(!primary1 && !primary2)
        return;

#if OUTPUTFILES == 1
    FILE *f = fopen("primary.m", "w");

    // Step through points in retainedSet.
    // See if they can be discarded using Mahalanobis and/or perturbation
    fprintf(f, "sd=%f\n", stddev);
    fprintf(f, "c=[");
#endif

    for(c = mainClusters; c; c = c->getNext())
    {
        c->computeStdDev();
#if OUTPUTFILES == 1
        c->print(f);
#endif
    }

#if OUTPUTFILES == 1
    fprintf(f, "];\n");
    fprintf(f, "ci=[");
    for(c = mainClusters; c; c = c->getNext())
//  for(c = buffer.getDiscardSet(); c; c = c->getNext())
    {
        c->printConfidence(f);
    }
    fprintf(f, "];\n");
    fprintf(f, "p=[");
#endif

    if(primary1)
    {
        for(i = 0; i < numClusters; i++)
            pointcounter[i] = 0;
        for(p = buffer.getRetainedSet(); p; p = p->getNext())
//          if(p->isNew())
                pointcounter[((MainCluster*)p->getAssociation())->id]++;
        m = 0;
        for(i = 0; i < numClusters; i++)
            if(pointcounter[i] > m)
                m = pointcounter[i];
        if(m)
        {
            list = new ClusterNode[m];
            for(i = 0; i < numClusters; i++)
            {
                if(!pointcounter[i])
                    continue;
                // Sort this cluster's retained points by Mahalanobis distance
                // and use the closest discardFraction to set the discard radius.
                k = 0;
                for(p = buffer.getRetainedSet(); k < pointcounter[i] && p; p = p->getNext())
                    if(((MainCluster*)p->getAssociation())->id == i) // && p->isNew()
                    {
                        list[k].point = p;
                        list[k++].distance = p->getAssociation()->Mahalanobis(p);
                    }
//              printf("Sort %d: ", i);
                qsort(list, pointcounter[i], sizeof(ClusterNode), ClusterNode::compareDistance);
//              for(k = 0; k < pointcounter[i]; k++)
//                  printf("%f ", list[k].distance);
//              printf("");
                numdiscard = (int)(ceil(pointcounter[i]*discardFraction) - 1);
                if(numdiscard > 0)
                    discardradius = list[numdiscard].distance;
                else
                    discardradius = 0;
//              printf("Radius: %f\n", discardradius);
                for(k = 0; k <= numdiscard; k++)
                {
                    p = list[k].point;
                    p->getAssociation()->getAssociation()->addPoint(p);
                    buffer.removeFromRetainedSet(p);
//                  printf("Removed(1A)\n");
                    discarded++;
#if OUTPUTFILES == 1
                    fprintf(f, "2 ");
                    p->print(f);
#endif
                    delete p;
                }
//              printf("Discarded: %d/%d\n", numdiscard,
//                  pointcounter[i]);
                for(p = buffer.getRetainedSet(); p; p = np)
                {
                    np = p->getNext();
                    if(((MainCluster*)p->getAssociation())->id == i)
                    {
                        if(p->getAssociation()->Mahalanobis(p) < discardradius)
                        {
                            p->getAssociation()->getAssociation()->addPoint(p);
                            buffer.removeFromRetainedSet(p);
//                          printf("Removed(1B)\n");
#if OUTPUTFILES == 1
                            fprintf(f, "2 ");
                            p->print(f);
#endif
                            delete p;
                            discarded++;
                        }
                    }
                }
                for(c = buffer.getCompressedSet(); c; c = nc)
                {
                    nc = c->getNext();
                    if(c->Mahalanobis(c->getAssociation()) < discardradius)
                    {
                        c->getAssociation()->getAssociation()->addCluster(c);
                        buffer.removeFromCompressedSet(c);
//                      printf("Secondary cluster discarded.\n");
                        delete c;
                    }
                }
            }
            delete []list;
        }
    }
//  printf("Discarded %d ", discarded);
    discarded = 0;
/*  for(c = buffer.getDiscardSet(); c; c = c->getNext())
    {
        c->updateMean();
        c->computeStdDev();
        c->print(f);
    }*/

    // Perturbation
    if(primary2)
    {
        for(p = buffer.getRetainedSet(); p; p = np)
        {
            q = p->getAssociation();
            np = p->getNext();
            md = 1e10;
            for(c = mainClusters; c; c = c->getNext())
            {
                d = c->perturb(p, c == q, stddev);
                if(d < md)
                {
                    md = d;
                    mc = c;
                }
            }
            if(mc == q)
            {
                q->getAssociation()->addPoint(p);
                buffer.removeFromRetainedSet(p);
                //printf("Removed %d.\n", q->getAssociation());
                discarded++;
#if OUTPUTFILES == 1
                fprintf(f, "1 ");
                p->print(f);
#endif
                delete p;
            }
            else
            {
                //printf("Not removed.\n");
#if OUTPUTFILES == 1
                fprintf(f, "0 ");
                p->print(f);
#endif
            }
        }
//      printf("%d\n", discarded);
#if OUTPUTFILES == 1
        fprintf(f, "];\n");
        fprintf(f, "p0=find(p==0);\n");
        fprintf(f, "p1=find(p==1);\n");
        fprintf(f, "p2=find(p==2);\n");
        fprintf(f, "clf\n");
        fprintf(f, "plot(p(p0,2),p(p0,3),'.');\n");
        fprintf(f, "hold on;\n");
        fprintf(f, "plot(p(p1,2),p(p1,3),'+');\n");
        fprintf(f, "plot(p(p2,2),p(p2,3),'x');\n");
        fprintf(f, "plot(c(:,1),c(:,2),'o')\n");
        fprintf(f, "for i=1:size(c,1)\n");
        fprintf(f, "rectangle('Position', [c(i,1)-sd*ci(i,1) c(i,2)-sd*ci(i,2) 2*sd*ci(i,1) 2*sd*ci(i,2)]);\n");
        fprintf(f, "end;\n");
#endif
    }

#if OUTPUTFILES == 1
    fclose(f);
#endif
}

/* Secondary compression: run k-means with numNewSecondaryClusters centers on
   the remaining retained points, keep the subclusters that pass the tightness
   test (isTight(beta)), and merge them agglomeratively with the existing
   compressed set. */
void ScalableKMeans::secondaryCompression(void)
{
    int numclusters;
    int i, j, dim = buffer.getDimensions();
    MainCluster **newcluster = new MainCluster*[numNewSecondaryClusters];
    Subcluster **allcluster;
    Subcluster *c;
    float *distances;
    Singleton *p, *np;
    int m, k, mi, mj;
    float md, d;
    float div = ((float)dim)*dim*numClusters;
    int totalclusters;
    long pointmem;
    int oldnumclusters;

    p = buffer.getRetainedSet();
    numclusters = 0;
    if(p)
        pointmem = p->getMemUsed();
    for(i = 0; i < numNewSecondaryClusters; i++)
    {
        if(p)
        {
            newcluster[i] = new MainCluster(dim);
            newcluster[i]->addPoint(p);
            newcluster[i]->updateMean();
            p = p->getNext();
            numclusters++;
        }
        else
            newcluster[i] = 0;
    }

    // Cluster points in retainedSet
    if(numclusters > 0)
        do
        {
            for(p = buffer.getRetainedSet(); p; p = p->getNext())
            {
                md = 1e10;
                m = -1;
                for(i = 0; i < numclusters; i++)
                    if((d = newcluster[i]->distanceSquared(p)) < md)
                    {
                        md = d;
                        m = i;
                    }
                p->associate(newcluster[m]);
            }
            for(i = 0; i < numclusters; i++)
                newcluster[i]->clear();
            for(p = buffer.getRetainedSet(); p; p = p->getNext())
                p->getAssociation()->addPoint(p);
            d = 0;
            for(i = 0; i < numclusters; i++)
            {
                d += newcluster[i]->updateMean();
//              printf("S%d ", i);
//              newcluster[i]->print();
            }
            d /= div;
            numSecondaryIterations++;
        } while(d > secondaryConvergenceThreshold);

    totalclusters = buffer.getCompressedSetSize();
    // Find tight clusters
    for(i = 0; i < numclusters; i++)
    {
        newcluster[i]->computeStdDev();
        if(!newcluster[i]->isTight(beta))
        {
            //delete newcluster[i];
            //newcluster[i] = 0;
            newcluster[i]->id = 2;
        }
        else
        {
            totalclusters++;
            newcluster[i]->id = 0;
//          printf("Tight %d %d.\n", totalclusters - 1,
//              newcluster[i]->getNumPoints());
        }
    }
/*  for(i = 0; i < numclusters; i++)
    {
        if(!newcluster[i]->id &&
            newcluster[i]->getMemUsed() < newcluster[i]->getNumPoints()*pointmem)
        {
            newcluster[i]->id = 1;
            printf("Marked cluster.\n");
        }
    }
    for(p = buffer.getRetainedSet(); p; p = np)
    {
        np = p->getNext();
        if(((MainCluster*)p->getAssociation())->id == 1)
        {
            buffer.removeFromRetainedSet(p);
            delete p;
            printf("Compressed.\n");
        }
    }
    for(i = 0; i < numclusters; i++)
    {
        if(newcluster[i] && newcluster[i]->id == 1)
        {
            buffer.addSecondaryCluster(newcluster[i]);
            newcluster[i] = 0;
            printf("Added cluster.\n");
        }
    }*/

    allcluster = new Subcluster*[totalclusters];
    k = 0;
    distances = new float[totalclusters*totalclusters];
    for(c = buffer.getCompressedSet(); c; c = c->getNext())
        allcluster[k++] = c;
    oldnumclusters = k;
    for(i = 0; i < numclusters; i++)
    {
        if(newcluster[i] && !newcluster[i]->id)
            allcluster[k++] = newcluster[i];
    }
    for(i = 0; i < totalclusters - 1; i++)
        for(j = i + 1; j < totalclusters; j++)
            distances[i*totalclusters + j] = -1;
    while(1)
    {
        // Find the closest pair of subclusters; pairwise distances are
        // computed lazily and cached in the distances matrix.
        md = 1e10;
        mi = -1;
        fflush(stdout);
        for(i = 0; i < totalclusters - 1; i++)
        {
            for(j = i + 1; j < totalclusters; j++)
            {
//              printf("%d %d %f\n", i, j, distances[i*totalclusters + j]);
                if((d = distances[i*totalclusters + j]) == -1)
                {
                    d = distances[i*totalclusters + j] =
                        allcluster[i]->distanceSquared(allcluster[j]);
                }
                if(d >= 0 && d < md)
                {
                    md = d;
                    mi = i;
                    mj = j;
                }
            }
        }
        if(mi < 0)
            break;
//      printf("totalclusters = %d\n", totalclusters);
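
The primary-compression step above folds a retained point into its cluster's discard set when its Mahalanobis distance to the cluster falls inside a per-cluster discard radius. Below is a minimal, self-contained sketch of that test, assuming a diagonal covariance model (suggested by the computeStdDev()/Mahalanobis() pairing in this file); the helper name and values are hypothetical and not part of the Subcluster API.

// Illustrative sketch only -- not part of scalablekmeans.cpp.
#include <math.h>
#include <stdio.h>

// Hypothetical helper: squared Mahalanobis distance of point x to a cluster
// described by per-dimension mean[] and standard deviation stddev[].
static float mahalanobisSquared(const float *x, const float *mean,
                                const float *stddev, int dim)
{
    float sum = 0;
    for(int i = 0; i < dim; i++)
    {
        float z = (x[i] - mean[i]) / stddev[i];
        sum += z*z;
    }
    return sum;
}

int main(void)
{
    const int dim = 2;
    float mean[dim]   = { 0.0f, 0.0f };
    float stddev[dim] = { 1.0f, 2.0f };
    float point[dim]  = { 0.5f, 1.0f };

    // discardradius plays the role of list[numdiscard].distance in
    // primaryCompression(): the distance of the farthest point among the
    // closest discardFraction of a cluster's retained points.
    float discardradius = 1.0f;

    float d = mahalanobisSquared(point, mean, stddev, dim);
    printf("distance = %f -> %s\n", d,
        d < discardradius ? "fold into discard set" : "keep in retained set");
    return 0;
}

Whether the library's Mahalanobis() returns the squared distance or its square root is not visible in this file; the radius comparison behaves the same either way, since both sides come from the same function.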