⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 epc2d.c

📁 一个投影聚类算法及其数据集生成源码。 参考文献: Eric K.K. Ng, A. Fu : Efficient algorithm for Projected Clustering,
💻 C
📖 第 1 页 / 共 4 页
字号:
//		printf ("(%d)%d ", i, base[i]);	}//	printf ("\n");	for (n=0; n<dataset->nopoint; n++)	{		noitem = 0;		for (i=0; i<no_entry; i++)	{				if (dataset->point[n].signature_2d[i] !=0)				noitem++;		}		if (noitem > 0)	{			printf ("%d ", noitem);			for (i=1; i<no_entry; i++)	{					if (dataset->point[n].signature_2d[i] != 0)//					printf ("%d (%d)(%d)", dataset->point[n].signature_2d[i]+base[i], dataset->point[n].signature_2d[i], base[i]);						printf ("%d ", dataset->point[n].signature_2d[i]+base[i]);			}			printf ("\n");		}	}}void assign_Clusters_2d (Signature_Bit_List *signature_bit_list, Cluster_2d *theclusters, int nodimension){	int i, j;	int k, l;	Signature_Bitptr current_signature = signature_bit_list->next->next_signature;	ID_Planeptr current_plane;	int no_bounded;	int WordNo = (nodimension-1)/32;	for (i=0; i<signature_bit_list->no_signature; i++)	{		no_bounded = 0;		for (k=0; k<=WordNo; k++)	{			for (l=0; l<32; l++)	{				if ((((current_signature->bitmap[k])>>l) & 1) == 1)					no_bounded++;			}		}				theclusters[i].no_plane = current_signature->no_id_plane;		theclusters[i].dimension1 = (int *)malloc (sizeof (int) * current_signature->no_id_plane);		theclusters[i].dimension2 = (int *)malloc (sizeof (int) * current_signature->no_id_plane);		theclusters[i].dense_id = (int *)malloc (sizeof (int) * current_signature->no_id_plane);		current_plane = current_signature->next_plane->next;		for (j=0; j<current_signature->no_id_plane; j++)	{			theclusters[i].dimension1[j] = current_plane->dimension1;			theclusters[i].dimension2[j] = current_plane->dimension2;			theclusters[i].dense_id[j] = current_plane->id[0];			current_plane = current_plane -> next;		}		current_signature = current_signature->next_signature;	}}int dense_id_2_base (int position, int nodimension, Histogram_2d *histogram){	int j, k, m, base=0;	int dimension1, dimension2;	dense_id_2_dim (position, nodimension, &dimension1, &dimension2);	for (j=0; j<dimension1; j++)	{		for (k=j+1; k<nodimension; k++)	{			if (j==dimension1 && k == dimension2)					return (base);			base += histogram[j*nodimension+k].no_dense;		}	}}void dense_id_2_dim (int position, int nodimension, int *dimension1, int *dimension2){	int i, j;	int temp = position+1;	*dimension1=0;	for (i=nodimension-1; temp>0; i--)	{		(*dimension1)++;		temp -= i;	}		i++;	(*dimension1)--;	temp += i;	*dimension2 = *dimension1+temp;}	void assign_Points_2d (FILE *fp, Dataset *dataset, Cluster_2d *clusters_2d, int no_cluster, Histogram_2d *histogram, int left_margin, int right_margin){	long n;	int interval_length;	int i, j, k, l;	float *likeliness;	int match;	int interval1, interval2;	float max;	int maxlike;	/* calculate the corresponding interval length */	interval_length = (right_margin-left_margin)/NO_INTERVAL;	likeliness = (float *)malloc (sizeof (float) * no_cluster);	/* For each Data point */	for (n=0; n<dataset->nopoint; n++)	{		fprintf (fp, "Point %d\n", n);
#ifndef WIN32		bzero (likeliness, sizeof (float ) * no_cluster);
#else
		memset(likeliness, 0, sizeof(float)*no_cluster);
#endif		/* calculate the likeliness to each cluster */		for (k=0; k<no_cluster; k++)	{			match=0;			for (l=0; l<clusters_2d[k].no_plane; l++)	{				i = clusters_2d[k].dimension1[l];				j = clusters_2d[k].dimension2[l];						/* find interval where the point locate */				interval1 = (dataset->point[n].component[i]-left_margin)/interval_length;					/* handle the case coordinate = margin */				if (interval1==NO_INTERVAL)					interval1--;				else if (interval1<0)					interval1++;				interval2 = (dataset->point[n].component[j]-left_margin)/interval_length;				/* handle the case coordinate = margin */				if (interval2==NO_INTERVAL)					interval2--;				else if (interval2<0)					interval2++;								if (histogram[i*dataset->nodimension+j].dense_id[interval1*NO_INTERVAL+interval2] == clusters_2d[k].dense_id[l])	{					match ++;				}			}			likeliness[k] = (float)match/(float)clusters_2d[k].no_plane;			fprintf (fp, "k = %d, likeliness = %.3f\n",k, likeliness[k]);		}		/* find the cluster with max. likelines */		max = 0.0;		maxlike=0;		for (k=0; k<no_cluster; k++)	{			if ((likeliness[k]>=0.1)&&(likeliness[k]>max))	{				max = likeliness[k];				maxlike = k+1;			}		}		fprintf (fp, "max is %.3f\n", max);		dataset->point[n].clust_assg = maxlike;		fprintf (fp, "%d is %d\n", n, dataset->point[n].clust_assg);	}}				/* -----------------------------------------------------------------------	Function	:	print_clustasg	Purpose		:	print the final cluster assignment of each datapoint to the result file	Algorithm	:	for each datapoint					printf the Outlier Group first				for each cluster configuration					for each datapoint						if the datapoint belong to the cluster, print it -----------------------------------------------------------------------------*/void print_clustasg(FILE *output, Dataset *dataset, float left_margin, float right_margin, int no_cluster)	{	long i;	int j;	fprintf (output, "%d\n%d\n", dataset->nopoint, dataset->nodimension);	fprintf (output,  "%.0f\n%.0f\n", left_margin, right_margin);	fprintf (output, "/ Outlier\n");	for (i=0; i<dataset->nopoint; i++)	{		if (dataset->point[i].clust_assg == 0)	{			fprintf (output, "%d\n", i);		}	}		for (j=0; j<no_cluster; j++)	{		fprintf (output, "/ Cluster %d\n", j);		for (i=0; i<dataset->nopoint; i++)	{			if (dataset->point[i].clust_assg == j+1)	{				fprintf (output, "%d\n", i);			}		}	}	}void print_clustconfig_2d(FILE *out, Cluster_2d *cluster, int nocluster){	int i, j;	for (i=0; i<nocluster; i++)	{//		for (j=0; j<cluster[i].no_plane; j++)	{//			fprintf (out, "Plane %d ID %d Dimensions %d %d\n", j, cluster[i].dense_id[j], cluster[i].dimension1[j], cluster[i].dimension2[j]);//		}	}} int main (int argc, char *argv[]){	FILE *datafile,  *resultfile, *outputfile;	/* the FILE pointers */	char buffer[10000];				/* char buffer to store strings from input file */	Dataset dataset;				/* Structure stores and points to information of all data points */	float left_margin;				/* left margin of the whole space (we assume it is the same for all dimensions */	float right_margin;				/* right margin of the whole spae (we assuem it is the same for all dmiensions */	int max_no_cluster;				/* maximum no. of clusters the user is interested to find out */	Histogram_2d *histogram_2d;	Cluster *theclusters;				/* pointer to array of structures storing final cluster configurations */	Cluster_2d *theclusters_2d;	int no_cluster;					/* final no of clusters discovered */	int no_cluster_2d;				/* final no of clusters discovered */		long i;						/* loop index for datapoint id */	int j;						/* loop index */	int k;						/* loop index */	int found=1;					/* for find_Dense_region to indicate whether new dense is discovered (and continue the iteration */	float old_threshold;	float new_threshold;	int current_id;	int no_entry=0;	Signature_Bit_List *signature_bit_list;	Signature_Bit *signature_bit;	Signature_Bit_List *signature_bit_list_sorted;	Signature_Bit_List *signature_bit_list_sorted_combined;	int iteration;#ifndef WIN32	struct rusage start_ru, end_ru; 		/* for user CPU time */#endif	/* Check command line argument */	if (argc!=6)	{		printf ("Usage : %s datafile outputfile resultfile max_no_cluster threshold\n", argv[0]);		exit (1);	}		/* get the max_no_cluster from command line  */	max_no_cluster = (atoi(argv[4]));	/* Open the Files */	datafile = fopen (argv[1], "r");	outputfile = fopen (argv[2], "w");	resultfile = fopen (argv[3], "w");	THRESHOLD = atoi (argv[5]);#ifndef WIN32        /* start time */        getrusage(RUSAGE_SELF, &start_ru);#endif        /* Read the first two line of data file (no_point, dimension) */        fgets(buffer, 10000, datafile);	sscanf (buffer, "%ld", &(dataset.nopoint));        fgets(buffer, 10000, datafile);	sscanf (buffer, "%d", &(dataset.nodimension));        fgets(buffer, 10000, datafile);	sscanf (buffer, "%f", &left_margin);        fgets(buffer, 10000, datafile);	sscanf (buffer, "%f", &right_margin);	rewind(datafile);	/*------------------------------------------------------------------------------*/	/* Memory Allocation 								*/	/* for dataset */	for (i=dataset.nodimension-1; i>0; i--)		no_entry+=i;	/* initialize dataset */	dataset.point = (Point *)malloc (sizeof (Point) * dataset.nopoint);	for (i=0; i<dataset.nopoint; i++)	{		dataset.point[i].component = (float *)malloc(sizeof(float) * dataset.nodimension);		dataset.point[i].signature = (int *)malloc(sizeof(int) * dataset.nodimension);		dataset.point[i].signature_2d = (int *)malloc(sizeof(int) * no_entry);
#ifndef WIN32		bzero (dataset.point[i].component, sizeof (float) * dataset.nodimension);		bzero  (dataset.point[i].signature, sizeof (int ) * dataset.nodimension);		bzero (dataset.point[i].signature_2d, sizeof (int) * no_entry);
#else
		memset (dataset.point[i].component, 0, sizeof (float) * dataset.nodimension);
		memset  (dataset.point[i].signature, 0, sizeof (int ) * dataset.nodimension);
		memset (dataset.point[i].signature_2d, 0, sizeof (int) * no_entry);
#endif		dataset.point[i].clust_assg = 0;	}	/* initialize histogram 2d */	histogram_2d = (Histogram_2d *)malloc(sizeof(Histogram_2d) * dataset.nodimension*dataset.nodimension);	for (j=0; j<dataset.nodimension*dataset.nodimension; j++)	{		histogram_2d[j].density = (int *)malloc (sizeof(int) * NO_INTERVAL*NO_INTERVAL);		histogram_2d[j].dense_id = (int *)malloc (sizeof(int) * NO_INTERVAL*NO_INTERVAL);
#ifndef WIN32		bzero (histogram_2d[j].density, sizeof(int) * NO_INTERVAL*NO_INTERVAL);		bzero (histogram_2d[j].dense_id, sizeof(int) * NO_INTERVAL*NO_INTERVAL);
#else
		memset (histogram_2d[j].density, 0, sizeof(int) * NO_INTERVAL*NO_INTERVAL);
		memset (histogram_2d[j].dense_id, 0, sizeof(int) * NO_INTERVAL*NO_INTERVAL);
#endif		histogram_2d[j].no_dense = 0;		histogram_2d[j].maxid = 0;	}	/* for cluster configuration */	theclusters = (Cluster *)malloc(sizeof(Cluster) * 30*max_no_cluster);	theclusters_2d = (Cluster_2d *)malloc (sizeof (Cluster) * 30 * max_no_cluster);
#ifndef WIN32	bzero (theclusters, sizeof(theclusters));	bzero (theclusters_2d, sizeof(theclusters_2d));
#else
	memset (theclusters, 0, sizeof(theclusters));
	memset (theclusters_2d, 0, sizeof(theclusters_2d));
#endif	for (i=0; i<30*max_no_cluster; i++)	{		theclusters[i].dimension = (int *)malloc(sizeof(int) * dataset.nodimension);		theclusters[i].locationleft = (float *)malloc(sizeof(float) * dataset.nodimension);		theclusters[i].locationright = (float *)malloc(sizeof(float) * dataset.nodimension);		theclusters_2d[i].dimension1 = (int *)malloc(sizeof(int) * dataset.nodimension*dataset.nodimension);		theclusters_2d[i].dimension2 = (int *)malloc(sizeof(int) * dataset.nodimension*dataset.nodimension);		theclusters_2d[i].dense_id = (int *)malloc(sizeof(int) * dataset.nodimension*dataset.nodimension);	}	/* initialize signature_bit_list */	signature_bit_list = (Signature_Bit_List*) malloc (sizeof (Signature_Bit_List) * NO_SLOT);	signature_bit_list_sorted = (Signature_Bit_List*) malloc (sizeof (Signature_Bit_List) * NO_SLOT);	signature_bit_list_sorted->no_signature = 0;	signature_bit_list_sorted->next = (Signature_Bitptr) malloc(sizeof (Signature_Bit));	signature_bit_list_sorted->next->next_signature = NULL;	signature_bit_list_sorted_combined = (Signature_Bit_List*) malloc (sizeof (Signature_Bit_List) * NO_SLOT);	signature_bit_list_sorted_combined->no_signature = 0;	signature_bit_list_sorted_combined->next = (Signature_Bitptr) malloc(sizeof (Signature_Bit));	signature_bit_list_sorted_combined->next->next_signature = NULL;	for (i=0; i<NO_SLOT; i++)	{		signature_bit_list[i].no_signature = 0;		signature_bit_list[i].next = (Signature_Bitptr) malloc (sizeof (Signature_Bit));		signature_bit_list[i].next->next_signature = NULL;;	}			/* Finished Memory Allocation							*/	/* -----------------------------------------------------------------------------*/	/* Read In the Data */	readin_Points(datafile, &dataset);	/* Generate 2-d Histograms */	gen_histogram_2d(outputfile, &dataset, left_margin, right_margin, histogram_2d);	/* For each 2D plane */	/* detect dense region */	for (i=0; i<dataset.nodimension; i++)			for (j=i+1; j<dataset.nodimension; j++)	{			iteration = 0;			new_threshold = find_threshold_histogram_2d (outputfile, histogram_2d, i*dataset.nodimension+j, iteration);			old_threshold = 100000.0;			current_id=1;//			while (old_threshold * pow (0.8, iteration) > new_threshold)	{			while (old_threshold  > new_threshold)	{				old_threshold = new_threshold;				detectdense_2d (outputfile , histogram_2d, i*dataset.nodimension+j, new_threshold, &current_id);				countdense_2d(outputfile, histogram_2d, i*dataset.nodimension+j);				iteration++;				new_threshold = find_threshold_histogram_2d (outputfile, histogram_2d, i*dataset.nodimension+j, iteration);			}							printf ("Plan %d %d has %d dense region \n", i, j, histogram_2d[i*dataset.nodimension+j].no_dense);			if (histogram_2d[i*dataset.nodimension+j].no_dense>0)					print_histogram_2d (outputfile, &dataset, histogram_2d, i*dataset.nodimension+j);		}	/* Deriving 2D Signatures for all datapoints*/	derive_Signature_2d (&dataset, histogram_2d, left_margin, right_margin);	/* Derive and insert Signature BIT for all datapoints */	for (i=0; i<dataset.nopoint; i++)	{		signature_bit = (Signature_Bit *)derive_signature_bit (dataset.point[i].signature_2d, dataset.nodimension, no_entry);		insert_signature_bit (signature_bit_list, signature_bit, dataset.nodimension);	}	sort_signature_bit_score (signature_bit_list, signature_bit_list_sorted);	refine_signature_bit_list (signature_bit_list_sorted, 40*max_no_cluster);		print_signature_bit_list (outputfile, signature_bit_list_sorted, dataset.nodimension);	combine_signature_bit_list (outputfile, signature_bit_list_sorted, dataset.nodimension);	sort_signature_bit_score (signature_bit_list_sorted,signature_bit_list_sorted_combined);	print_signature_bit_list (outputfile, signature_bit_list_sorted_combined, dataset.nodimension);	refine_signature_bit_list (signature_bit_list_sorted_combined, max_no_cluster);	assign_Clusters_2d (signature_bit_list_sorted_combined, theclusters_2d, dataset.nodimension);	assign_Points_2d (outputfile, &dataset, theclusters_2d, signature_bit_list_sorted_combined->no_signature, histogram_2d, left_margin, right_margin);	print_clustasg(resultfile, &dataset, left_margin, right_margin, signature_bit_list_sorted_combined->no_signature);		print_clustconfig_2d (outputfile, theclusters_2d, signature_bit_list_sorted_combined->no_signature);	print_signature_bit_list (outputfile, signature_bit_list_sorted_combined, dataset.nodimension);#ifndef WIN32        /* end time */        getrusage(RUSAGE_SELF, &end_ru);        /* total user CPU time used */        fprintf(outputfile, "\n\n**** total user CPU time used: %d sec.",end_ru.ru_utime.tv_sec-start_ru.ru_utime.tv_sec);#endif	/* close the FILES */	fclose(datafile);	fclose(outputfile);	fclose (resultfile);}	

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -