📄 brfcm.c
字号:
/* brfcm - BitReduction FCM. Reduced precision through quantization (e.g. bit reduction) and aggregation. $Id: brfcm.c,v 1.2 2002/07/12 20:48:48 eschrich Exp $ Steven Eschrich Copyright (C) 2002 University of South Florida This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ #include <math.h>#include <sys/types.h>#include <time.h>#include <sys/time.h>#include <stdio.h>#include <stdlib.h>#include <sys/times.h>#include <sys/resource.h>#include <limits.h>#include <unistd.h>#include <string.h>#include "utils.h"/* Macros defined here. *//* The membership matrix (U) is indexed as U(cluster,example). However, for efficiency when calculating one example at a time, the arrays are created as U[example][cluster] so that all operations for a particular example (e.g. the membership update) occur in contiguous memory locations. A macro is used to hide the differences in the code*/#define U(cluster,example) U[example][cluster]/*============================================================== * * Global variables * * There are many global variables in this file. This is mainly * so that individual parameters, counters can be accessed by * external functions (i.e. the calling function). *==============================================================*//* Several variables are used to calculate the size of the hash table. The expected reduction in the dataset is the number of examples after compression (n*(1-expected_reduction)). The expected_number_of_collisions determines the size of the hash table.*/double expected_reduction=0.02;int expected_number_of_collisions=3;/* There are a variety of variables needed for a typical FCM clustering algorithm. *//* (0.225) Epsilon is the stopping condition; a change less than epsilon in sum squared difference in the U matrix stops the algorithm.*/double epsilon=0.225;/* (2.0) Fuzziness factor. Determines how fuzzy the clustering is */double m=2.0;/* (2) The number of clusters (often seen/called K in kmeans clustering) */int C=2;/* Data structures to hold clustering information *//* Cluster centroids */double **V;/* Membership matrix */double **U;/* Dataset variables *//* The dataset (examples) */double **X;/* Counter for num iterations */int number_of_iterations;/* Number of examples */int N=1;/* Number of features */int S=1;/* BRFCM specific variables *//* Number of bits to reduce */int R=0;/* Number of reduced examples */int N0=0;/* Collision statistic for hashing */int max_number_of_collisions;/* Flag for turning hashing on/off */Boolean use_hashing=True;/* BRFCM-specific data struct, Bins. This is the data structure to hold each quantized value and the associated members.*/typedef struct BinStruct { double *X; /* Full-precision (average) example */ double *rX; /* Reduced-precision (quantized) example */ int *members; /* Array of indices pointing to quant. members */ long int w; /* Number of members (weight) */} Bins;Bins *bins;/* A couple of miscellaneous variables */int *max_value; /* Stores max feature value per feature */long seed; /* Random seed */struct rusage compressing_usage; /* Timing for compressing stage *//*============================================================== * * Function Prototypes * *==============================================================*//* The main clustering functions */int brfcm();int update_centroids();double update_umatrix();/* Utilities */int init();int is_example_centroid();double distance();int distribute_membership();/* brfcm-specific functions (quantization/aggregation) */int reduce();int reduce_vector();int simple_find();int hash_find();int create_new_bin();/* Hash Utilities */int hash_init();int hash_update();int h();int output_centroids();int output_umatrix();int output_members();/*============================================================== * * External Function Prototypes * *==============================================================*/int load_test_data();int load_atr_data();int load_mri_data();/*============================================================== * * Main Function * *==============================================================*//* For testing purposes, we hard-code the desired number of clusters */#define ATR_NUMBER_OF_CLUSTERS 5#define MRI_NUMBER_OF_CLUSTERS 10#define TEST_NUMBER_OF_CLUSTERS 2#define TEST 1#define ATR 2#define MRI 3/* Global variables */int dataset_type=MRI;int write_centroids=0;int write_umatrix=0;int write_members=0;/* Variables that must be defined for called functions */int vals[][3]={{256,256,256},{0,0,0},{256,256,256},{4096,4096,4096}};/* Function prototypes */double *timing_of(); /* Calculate time in seconds */int main(int argc, char **argv){ struct rusage start_usage, end_usage; int ch; double *perf_times; char *filename; epsilon=0.225; m=2.0; seed=2000; max_value=vals[dataset_type]; use_hashing=True; while ( (ch=getopt(argc, argv,"hr:u:w:d:s:")) != EOF ) { switch (ch) { case 'h': fprintf(stderr,"Usage\n" \ "-r # number of bits to reduce\n"\ "-u Disable use of hashing for bit reduction\n"\ "-d [a|t|m|s] Use dataset atr, mri, test, seawifs\n"\ "-w write cluster centers and memberships out\n"\ "-s seed Use seed as the random seed\n"); exit(1); case 'r': /* number of bits to reduce */ R=atoi(optarg); break; case 'u': /* Disable use of hashing. Ultra fast, some pnemonic */ use_hashing=False; break; case 'w': if ( !strcmp(optarg,"umatrix") ) write_umatrix=1; if ( !strcmp(optarg,"centroids") ) write_centroids=1; if ( !strcmp(optarg,"members") ) write_members=1; if ( !strcmp(optarg,"all")) write_umatrix=write_centroids=write_members=1; break; case 'd': if ( *optarg == 'a' ) dataset_type=ATR; if ( *optarg == 'm' ) dataset_type=MRI; if ( *optarg == 't' ) dataset_type=TEST; max_value=vals[dataset_type]; break; case 's': seed=atol(optarg); break; default: } } /* Print out main parameters for this run */ fprintf(stdout,"FCM Parameters\n clusterMethod=brfcm\n"); filename=argv[optind]; fprintf(stdout," file=%s\n",filename); fprintf(stdout," bit reduction=%d\n use Hashing=%s\n\n",R,use_hashing?"yes":"no"); /* Load the dataset, using one of a particular group of datasets. */ switch (dataset_type) { case TEST: load_test_data(&X, &S, &N); C=TEST_NUMBER_OF_CLUSTERS; break; case ATR: load_atr_data(argv[optind],&X, &S, &N); C=ATR_NUMBER_OF_CLUSTERS; break; case MRI: load_mri_data(argv[optind], &X, &S, &N); C=MRI_NUMBER_OF_CLUSTERS; break; } fprintf(stdout, "Beginning to cluster...\n"); /* Time the brfcm algorithm */ getrusage(RUSAGE_SELF, &start_usage); brfcm(); getrusage(RUSAGE_SELF, &end_usage); /* Output whatever clustering results we need */ if ( write_centroids ) output_centroids(filename); if ( write_umatrix ) output_umatrix(filename); if ( write_members ) output_members(filename); /* Output timing numbers */ perf_times=timing_of(start_usage, compressing_usage); printf("Compressing Timing: %f user, %f system, %f total.\n", perf_times[0], perf_times[1], perf_times[0] + perf_times[1]); perf_times=timing_of(compressing_usage, end_usage); printf("Clustering Timing: %f user, %f system, %f total.\n", perf_times[0], perf_times[1], perf_times[0] + perf_times[1]); perf_times=timing_of(start_usage, end_usage); printf("Total Timing: %f user, %f system, %f total.\n", perf_times[0], perf_times[1], perf_times[0] + perf_times[1]); printf("Clustering required %d iterations.\n", number_of_iterations); printf("Dataset size %d reduced to %d, %f%% reduction.\n", N, N0, 100.0*(1.0-(N0/(double)N))); printf("Max. hash table collisions: %d\n", max_number_of_collisions); return 0;}/************************************************************ * * Main functions for the file: * generally call only brfcm() * ************************************************************/int brfcm(){ double sqrerror = 2 * epsilon; /* initialize counters */ number_of_iterations=0; /* Reduce dataset before continuing. This routine will set bins and return the number of reduced vectors. */ N0=reduce(); /* Dynamically allocate storage */ init(); /* Run the updates iteratively */ while (sqrerror > epsilon ) { number_of_iterations++; update_centroids(); sqrerror=update_umatrix(); } /* We go ahead and update the centroids - presumably this will not change much, since the overall square error in U is small */ update_centroids(); /* Special case for brfcm - distribute reduced vector membership */ distribute_membership(); return 0;}/* update_centroids() Given a membership matrix U, recalculate the cluster centroids as the "weighted" mean of each contributing example from the dataset. Each example contributes by an amount proportional to the membership value.*/int update_centroids(){ int i,k,x; double numerator[S], denominator; double U_ikm; /* For each cluster */ for (i=0; i < C; i++) { /* Zero out numerator and denominator options */ denominator=0; for (x=0; x < S; x++) numerator[x]=0; /* Calculate numerator and denominator together */ for (k=0; k < N0; k++) { U_ikm=bins[k].w * pow(U(i,k),m); denominator += U_ikm; for (x=0; x < S; x++) numerator[x] += U_ikm * bins[k].X[x]; } /* Calculate V */ for (x=0; x < S; x++) V[i][x]= numerator[x] / denominator; } /* endfor: C clusters */ return 0;}double update_umatrix(){ int i,j,k; int example_is_centroid; double summation, D_k[C]; double square_difference=0; double newU; /* For each example in the dataset */ for ( k=0; k < N0; k++) { /* Special case: If Example is equal to a Cluster Centroid, then U=1.0 for that cluster and 0 for all others */ if ( (example_is_centroid=is_example_centroid(k)) != -1 ) { fprintf(stderr,"Example is centroid\n"); for (i=0; i < C; i++) { if ( i == example_is_centroid ) { square_difference += (U(i,k) -1.0) * (U(i,k)-1.0) * bins[k].w; U(i,k)=1.0; } else { square_difference += U(i,k) * U(i,k) * bins[k].w; U(i,k)=0.0; } } continue; } /* Cache the distance between this vector and all centroids. */ for (i=0; i < C; i++) D_k[i]=distance(bins[k].X, V[i]); /* For each class */ for (i=0; i < C; i++) { summation=0; /* Calculate summation */ for (j=0; j < C; j++) { if ( i == j ) summation+=1.0; else summation += pow( D_k[i] / D_k[j] , (2.0/ (m-1))); } /* Weight is 1/sum */ newU=1.0/(double)summation; /* Add to the squareDifference */ square_difference += (U(i,k) - newU) * (U(i,k) - newU) * bins[k].w ; U(i,k)=newU; } } /* endfor N0 */ return square_difference;}/*================================================================= General Utilities init() - allocate space for data structures is_example_centroid() - Compare an example to cluster centroids is_equal() - Are two vectors equal (in all dimensions) distance() - Distance metric between two vectors distribute_membership() - Distribute reduced-vector memberships to all members of the bin (replace U with a full U). =================================================================*//* Allocate storage for U and V dynamically. */int init(){ int i,j; /* Allocate necessary storage */ V=(double **)CALLOC(C,sizeof(double *)); for (i=0; i < C; i++) V[i]=(double *)CALLOC(S,sizeof(double)); U=(double **)CALLOC(N0, sizeof(double *)); for (i=0; i < N0; i++) U[i]=(double *)CALLOC(C,sizeof(double)); /* Place random values in V, then update U matrix based on it */ srand48(seed); for (i=0; i < C; i++) { for (j=0; j < S; j++) { V[i][j]=drand48() * max_value[j]; } } /* Once values are populated in V, update U matrix to sane values */ update_umatrix(); return 0;}/* If X[k] == V[i] for some i, then return that i. Otherwise, return -1 */int is_example_centroid(int k)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -