scalablekmeans.cpp
/* Scalable K-means clustering software
   Copyright (C) 2000 Fredrik Farnstrom and James Lewis

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   See the file README.TXT for more information. */

/* scalablekmeans.cpp */
/* Buffer-based phases of scalable k-means: model setup and initialization,
   filling the RAM buffer from the database, and primary/secondary
   compression of the retained point set. */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "singleton.h"
#include "subcluster.h"
#include "rambuffer.h"
#include "scalablekmeans.h"
#include "database.h"
#include "syntheticdata.h"

#define OUTPUTFILES 0
#define EARLYSTOPPING 1

/* qsort() comparison function: orders ClusterNodes by increasing distance. */
int ClusterNode::compareDistance(const void *a, const void *b)
{
    ClusterNode *aa = (ClusterNode*)a, *bb = (ClusterNode*)b;

    if(aa->distance > bb->distance)
        return 1;
    else if(aa->distance == bb->distance)
        return 0;
    else
        return -1;
}

/* Allocate one discard cluster, one main cluster and one "old" subcluster per
   requested cluster, and link them into the mainClusters/oldClusters lists. */
void ScalableKMeans::setup(void)
{
    int i;
    Subcluster *c;
    MainCluster *m;

    mainClusters = 0;
    oldClusters = 0;
    for(i = 0; i < numClusters; i++)
    {
        c = buffer.newDiscardCluster();
        m = new MainCluster(buffer.getDimensions());
        if(mainClusters)
            mainClusters->setPrevious(m);
        m->setNext(mainClusters);
        m->id = i;
        mainClusters = m;
        c->associate(m);
        m->associate(c);

        c = new Subcluster(buffer.getDimensions());
        if(oldClusters)
            oldClusters->setPrevious(c);
        c->setNext(oldClusters);
        oldClusters = c;
    }
}

/* Seed each main cluster with a randomly chosen point from the database. */
void ScalableKMeans::initModel(void)
{
    Subcluster *c, *c2;
//  Singleton *p = buffer.getRetainedSet();
    Singleton p(buffer.getDimensions());
    int k = 0, i, j;
    float *x;

    database->reset();
    c2 = oldClusters;
    for(c = mainClusters; c; c = c->getNext())
    {
        j = (rand() >> 3) % 80;
        for(i = 0; i <= j; i++)
            x = database->getPoint();
        p.set(x);
        c->clear();
        c->addPoint(&p);
        c->updateMean();
//      printf("%d ", k++); c->print();
//      p = p->getNext();
        c2->clear();
        c2->addCluster(c);
        c2->updateMean();
    }
}

/* Fill the RAM buffer with points from the database. */
int ScalableKMeans::fillBuffer(void)
{
    long k = 0;

    while(1)
    {
        // Call newSingleton() to see if new point fits
        Singleton *p = buffer.newSingleton();
        if(!p)
        {
            printf("%ld points added to buffer.\n", k);
            if(!k)
                return 0;
            return 1;
        }
        // If it fits,
        // get point from database
        float *x = database->getPoint();
        if(!x)
        {
            buffer.removeFromRetainedSet(p);
            delete p;
            break;
        }
        k++;
        // Call set() on the new point.
        p->set(x);
        // Repeat until buffer full or no more points in database.
//      p->print(stdout);
    }
    // Return 0 if database empty, 1 if buffer full
    return 0;
}

/* Primary compression: fold retained points into their cluster's discard set,
   either because they lie within the cluster's Mahalanobis discard radius
   (primary1) or because they stay assigned to the same cluster after the
   cluster means are perturbed (primary2). */
void ScalableKMeans::primaryCompression(void)
{
    Singleton *p, *np;
    Subcluster *c, *q, *mc, *nc;
    float d, md;
    long pointcounter[numClusters];
    int i, k, m;
    ClusterNode *list;
    long numdiscard;
    float discardradius;
    float stddev = confidenceStdDev;
    long discarded = 0;

    if(!primary1 && !primary2)
        return;

#if OUTPUTFILES == 1
    FILE *f = fopen("primary.m", "w");

    // Step through points in retainedSet.
    // See if they can be discarded using Mahalanobis and/or perturbation
    fprintf(f, "sd=%f\n", stddev);
    fprintf(f, "c=[");
#endif

    for(c = mainClusters; c; c = c->getNext())
    {
        c->computeStdDev();
#if OUTPUTFILES == 1
        c->print(f);
#endif
    }

#if OUTPUTFILES == 1
    fprintf(f, "];\n");
    fprintf(f, "ci=[");
    for(c = mainClusters; c; c = c->getNext())
//  for(c = buffer.getDiscardSet(); c; c = c->getNext())
    {
        c->printConfidence(f);
    }
    fprintf(f, "];\n");
    fprintf(f, "p=[");
#endif

    if(primary1)
    {
        for(i = 0; i < numClusters; i++)
            pointcounter[i] = 0;
        for(p = buffer.getRetainedSet(); p; p = p->getNext())
//          if(p->isNew())
                pointcounter[((MainCluster*)p->getAssociation())->id]++;
        m = 0;
        for(i = 0; i < numClusters; i++)
            if(pointcounter[i] > m)
                m = pointcounter[i];
        if(m)
        {
            list = new ClusterNode[m];
            for(i = 0; i < numClusters; i++)
            {
                if(!pointcounter[i])
                    continue;
                // Sort this cluster's retained points by Mahalanobis distance
                // and use the closest discardFraction to set the discard radius.
                k = 0;
                for(p = buffer.getRetainedSet(); k < pointcounter[i] && p; p = p->getNext())
                    if(((MainCluster*)p->getAssociation())->id == i) // && p->isNew()
                    {
                        list[k].point = p;
                        list[k++].distance = p->getAssociation()->Mahalanobis(p);
                    }
//              printf("Sort %d: ", i);
                qsort(list, pointcounter[i], sizeof(ClusterNode), ClusterNode::compareDistance);
//              for(k = 0; k < pointcounter[i]; k++)
//                  printf("%f ", list[k].distance);
//              printf("");
                numdiscard = (int)(ceil(pointcounter[i]*discardFraction) - 1);
                if(numdiscard > 0)
                    discardradius = list[numdiscard].distance;
                else
                    discardradius = 0;
//              printf("Radius: %f\n", discardradius);
                for(k = 0; k <= numdiscard; k++)
                {
                    p = list[k].point;
                    p->getAssociation()->getAssociation()->addPoint(p);
                    buffer.removeFromRetainedSet(p);
//                  printf("Removed(1A)\n");
                    discarded++;
#if OUTPUTFILES == 1
                    fprintf(f, "2 ");
                    p->print(f);
#endif
                    delete p;
                }
//              printf("Discarded: %d/%d\n", numdiscard,
//                  pointcounter[i]);
                for(p = buffer.getRetainedSet(); p; p = np)
                {
                    np = p->getNext();
                    if(((MainCluster*)p->getAssociation())->id == i)
                    {
                        if(p->getAssociation()->Mahalanobis(p) < discardradius)
                        {
                            p->getAssociation()->getAssociation()->addPoint(p);
                            buffer.removeFromRetainedSet(p);
//                          printf("Removed(1B)\n");
#if OUTPUTFILES == 1
                            fprintf(f, "2 ");
                            p->print(f);
#endif
                            delete p;
                            discarded++;
                        }
                    }
                }
                for(c = buffer.getCompressedSet(); c; c = nc)
                {
                    nc = c->getNext();
                    if(c->Mahalanobis(c->getAssociation()) < discardradius)
                    {
                        c->getAssociation()->getAssociation()->addCluster(c);
                        buffer.removeFromCompressedSet(c);
//                      printf("Secondary cluster discarded.\n");
                        delete c;
                    }
                }
            }
            delete []list;
        }
    }
//  printf("Discarded %d ", discarded);
    discarded = 0;
/*  for(c = buffer.getDiscardSet(); c; c = c->getNext())
    {
        c->updateMean();
        c->computeStdDev();
        c->print(f);
    }*/

    // Perturbation
    if(primary2)
    {
        for(p = buffer.getRetainedSet(); p; p = np)
        {
            q = p->getAssociation();
            np = p->getNext();
            md = 1e10;
            for(c = mainClusters; c; c = c->getNext())
            {
                d = c->perturb(p, c == q, stddev);
                if(d < md)
                {
                    md = d;
                    mc = c;
                }
            }
            if(mc == q)
            {
                q->getAssociation()->addPoint(p);
                buffer.removeFromRetainedSet(p);
                //printf("Removed %d.\n", q->getAssociation());
                discarded++;
#if OUTPUTFILES == 1
                fprintf(f, "1 ");
                p->print(f);
#endif
                delete p;
            }
            else
            {
                //printf("Not removed.\n");
#if OUTPUTFILES == 1
                fprintf(f, "0 ");
                p->print(f);
#endif
            }
        }
//      printf("%d\n", discarded);
#if OUTPUTFILES == 1
        fprintf(f, "];\n");
        fprintf(f, "p0=find(p==0);\n");
        fprintf(f, "p1=find(p==1);\n");
        fprintf(f, "p2=find(p==2);\n");
        fprintf(f, "clf\n");
        fprintf(f, "plot(p(p0,2),p(p0,3),'.');\n");
        fprintf(f, "hold on;\n");
        fprintf(f, "plot(p(p1,2),p(p1,3),'+');\n");
        fprintf(f, "plot(p(p2,2),p(p2,3),'x');\n");
        fprintf(f, "plot(c(:,1),c(:,2),'o')\n");
        fprintf(f, "for i=1:size(c,1)\n");
        fprintf(f, "rectangle('Position', [c(i,1)-sd*ci(i,1) c(i,2)-sd*ci(i,2) 2*sd*ci(i,1) 2*sd*ci(i,2)]);\n");
        fprintf(f, "end;\n");
#endif
    }

#if OUTPUTFILES == 1
    fclose(f);
#endif
}

/* Secondary compression: run k-means with numNewSecondaryClusters centers on
   the remaining retained points, keep the subclusters that pass the tightness
   test (isTight(beta)), and merge them agglomeratively with the existing
   compressed set. */
void ScalableKMeans::secondaryCompression(void)
{
    int numclusters;
    int i, j, dim = buffer.getDimensions();
    MainCluster **newcluster = new MainCluster*[numNewSecondaryClusters];
    Subcluster **allcluster;
    Subcluster *c;
    float *distances;
    Singleton *p, *np;
    int m, k, mi, mj;
    float md, d;
    float div = ((float)dim)*dim*numClusters;
    int totalclusters;
    long pointmem;
    int oldnumclusters;

    p = buffer.getRetainedSet();
    numclusters = 0;
    if(p)
        pointmem = p->getMemUsed();
    for(i = 0; i < numNewSecondaryClusters; i++)
    {
        if(p)
        {
            newcluster[i] = new MainCluster(dim);
            newcluster[i]->addPoint(p);
            newcluster[i]->updateMean();
            p = p->getNext();
            numclusters++;
        }
        else
            newcluster[i] = 0;
    }

    // Cluster points in retainedSet
    if(numclusters > 0)
        do
        {
            for(p = buffer.getRetainedSet(); p; p = p->getNext())
            {
                md = 1e10;
                m = -1;
                for(i = 0; i < numclusters; i++)
                    if((d = newcluster[i]->distanceSquared(p)) < md)
                    {
                        md = d;
                        m = i;
                    }
                p->associate(newcluster[m]);
            }
            for(i = 0; i < numclusters; i++)
                newcluster[i]->clear();
            for(p = buffer.getRetainedSet(); p; p = p->getNext())
                p->getAssociation()->addPoint(p);
            d = 0;
            for(i = 0; i < numclusters; i++)
            {
                d += newcluster[i]->updateMean();
//              printf("S%d ", i);
//              newcluster[i]->print();
            }
            d /= div;
            numSecondaryIterations++;
        } while(d > secondaryConvergenceThreshold);

    totalclusters = buffer.getCompressedSetSize();
    // Find tight clusters
    for(i = 0; i < numclusters; i++)
    {
        newcluster[i]->computeStdDev();
        if(!newcluster[i]->isTight(beta))
        {
            //delete newcluster[i];
            //newcluster[i] = 0;
            newcluster[i]->id = 2;
        }
        else
        {
            totalclusters++;
            newcluster[i]->id = 0;
//          printf("Tight %d %d.\n", totalclusters - 1,
//              newcluster[i]->getNumPoints());
        }
    }
/*  for(i = 0; i < numclusters; i++)
    {
        if(!newcluster[i]->id &&
            newcluster[i]->getMemUsed() < newcluster[i]->getNumPoints()*pointmem)
        {
            newcluster[i]->id = 1;
            printf("Marked cluster.\n");
        }
    }
    for(p = buffer.getRetainedSet(); p; p = np)
    {
        np = p->getNext();
        if(((MainCluster*)p->getAssociation())->id == 1)
        {
            buffer.removeFromRetainedSet(p);
            delete p;
            printf("Compressed.\n");
        }
    }
    for(i = 0; i < numclusters; i++)
    {
        if(newcluster[i] && newcluster[i]->id == 1)
        {
            buffer.addSecondaryCluster(newcluster[i]);
            newcluster[i] = 0;
            printf("Added cluster.\n");
        }
    }*/

    allcluster = new Subcluster*[totalclusters];
    k = 0;
    distances = new float[totalclusters*totalclusters];
    for(c = buffer.getCompressedSet(); c; c = c->getNext())
        allcluster[k++] = c;
    oldnumclusters = k;
    for(i = 0; i < numclusters; i++)
    {
        if(newcluster[i] && !newcluster[i]->id)
            allcluster[k++] = newcluster[i];
    }
    for(i = 0; i < totalclusters - 1; i++)
        for(j = i + 1; j < totalclusters; j++)
            distances[i*totalclusters + j] = -1;
    while(1)
    {
        // Find the closest pair of subclusters; pairwise distances are
        // computed lazily and cached in the distances matrix.
        md = 1e10;
        mi = -1;
        fflush(stdout);
        for(i = 0; i < totalclusters - 1; i++)
        {
            for(j = i + 1; j < totalclusters; j++)
            {
//              printf("%d %d %f\n", i, j, distances[i*totalclusters + j]);
                if((d = distances[i*totalclusters + j]) == -1)
                {
                    d = distances[i*totalclusters + j] =
                        allcluster[i]->distanceSquared(allcluster[j]);
                }
                if(d >= 0 && d < md)
                {
                    md = d;
                    mi = i;
                    mj = j;
                }
            }
        }
        if(mi < 0)
            break;
//      printf("totalclusters = %d\n", totalclusters);
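
The primary-compression step above folds a retained point into its cluster's discard set when its Mahalanobis distance to the cluster falls inside a per-cluster discard radius. Below is a minimal, self-contained sketch of that test, assuming a diagonal covariance model (suggested by the computeStdDev()/Mahalanobis() pairing in this file); the helper name and values are hypothetical and not part of the Subcluster API.

// Illustrative sketch only -- not part of scalablekmeans.cpp.
#include <math.h>
#include <stdio.h>

// Hypothetical helper: squared Mahalanobis distance of point x to a cluster
// described by per-dimension mean[] and standard deviation stddev[].
static float mahalanobisSquared(const float *x, const float *mean,
                                const float *stddev, int dim)
{
    float sum = 0;
    for(int i = 0; i < dim; i++)
    {
        float z = (x[i] - mean[i]) / stddev[i];
        sum += z*z;
    }
    return sum;
}

int main(void)
{
    const int dim = 2;
    float mean[dim]   = { 0.0f, 0.0f };
    float stddev[dim] = { 1.0f, 2.0f };
    float point[dim]  = { 0.5f, 1.0f };

    // discardradius plays the role of list[numdiscard].distance in
    // primaryCompression(): the distance of the farthest point among the
    // closest discardFraction of a cluster's retained points.
    float discardradius = 1.0f;

    float d = mahalanobisSquared(point, mean, stddev, dim);
    printf("distance = %f -> %s\n", d,
        d < discardradius ? "fold into discard set" : "keep in retained set");
    return 0;
}

Whether the library's Mahalanobis() returns the squared distance or its square root is not visible in this file; the radius comparison behaves the same either way, since both sides come from the same function.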