📄 brfcm.c

📁 这是一个改进的快速实现模糊c-means聚类算法的程序
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*    brfcm - BitReduction FCM. Reduced precision through quantization   (e.g. bit reduction) and aggregation.   $Id: brfcm.c,v 1.2 2002/07/12 20:48:48 eschrich Exp $   Steven Eschrich      Copyright (C) 2002 University of South Florida   This program is free software; you can redistribute it and/or modify it    under the terms of the GNU General Public License as published by the    Free Software Foundation; either version 2 of the License, or (at    your option) any later version.      This program is distributed in the hope that it will be useful, but    WITHOUT ANY WARRANTY; without even the implied warranty of    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    General Public License for more details.      You should have received a copy of the GNU General Public License along    with this program; if not, write to the Free Software Foundation, Inc.,    59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/   #include <math.h>#include <sys/types.h>#include <time.h>#include <sys/time.h>#include <stdio.h>#include <stdlib.h>#include <sys/times.h>#include <sys/resource.h>#include <limits.h>#include <unistd.h>#include <string.h>#include "utils.h"/* Macros defined here. *//* The membership matrix (U) is indexed as U(cluster,example). However,   for efficiency when calculating one example at a time, the arrays   are created as U[example][cluster] so that all operations for a    particular example (e.g. the membership update) occur in contiguous    memory locations.    A macro is used to hide the differences in the code*/#define U(cluster,example) U[example][cluster]/*============================================================== *  * Global variables  *  * There are many global variables in this file. This is mainly * so that individual parameters, counters can be accessed by * external functions (i.e. the calling function). *==============================================================*//*    Several variables are used to calculate the size of the hash   table.    The expected reduction in the dataset is the number of examples   after compression (n*(1-expected_reduction)).   The expected_number_of_collisions determines the size of the hash   table.*/double expected_reduction=0.02;int    expected_number_of_collisions=3;/*    There are a variety of variables needed for a typical FCM clustering   algorithm. *//* (0.225) Epsilon is the stopping condition; a change less than epsilon   in sum squared difference in the U matrix stops the algorithm.*/double epsilon=0.225;/* (2.0) Fuzziness factor. Determines how fuzzy the clustering is */double m=2.0;/* (2) The number of clusters (often seen/called K in kmeans clustering) */int    C=2;/* Data structures to hold clustering information *//* Cluster centroids */double **V;/* Membership matrix */double **U;/* Dataset variables *//* The dataset (examples) */double **X;/* Counter for num iterations */int    number_of_iterations;/* Number of examples */int    N=1;/* Number of features */int    S=1;/* BRFCM specific variables *//* Number of bits to reduce */int    R=0;/* Number of reduced examples */int    N0=0;/* Collision statistic for hashing */int    max_number_of_collisions;/* Flag for turning hashing on/off */Boolean use_hashing=True;/*  BRFCM-specific data struct, Bins. This is the data structure to  hold each quantized value and the associated members.*/typedef struct BinStruct {  double    *X;           /* Full-precision (average) example */  double    *rX;          /* Reduced-precision (quantized) example */  int       *members;     /* Array of indices pointing to quant. members */  long int  w;            /* Number of members (weight) */} Bins;Bins        *bins;/*  A couple of miscellaneous variables */int      *max_value;               /* Stores max feature value per feature */long     seed;                     /* Random seed */struct rusage compressing_usage;   /* Timing for compressing stage *//*============================================================== *  *  Function Prototypes * *==============================================================*//* The main clustering functions */int    brfcm();int    update_centroids();double update_umatrix();/* Utilities */int    init();int    is_example_centroid();double distance();int    distribute_membership();/* brfcm-specific functions (quantization/aggregation) */int reduce();int reduce_vector();int simple_find();int hash_find();int create_new_bin();/* Hash Utilities */int hash_init();int hash_update();int h();int output_centroids();int output_umatrix();int output_members();/*============================================================== *  *  External Function Prototypes * *==============================================================*/int load_test_data();int load_atr_data();int load_mri_data();/*============================================================== *  *  Main Function * *==============================================================*//* For testing purposes, we hard-code the desired number of clusters */#define ATR_NUMBER_OF_CLUSTERS 5#define MRI_NUMBER_OF_CLUSTERS 10#define TEST_NUMBER_OF_CLUSTERS 2#define TEST  1#define ATR   2#define MRI   3/* Global variables */int     dataset_type=MRI;int     write_centroids=0;int     write_umatrix=0;int     write_members=0;/* Variables that must be defined for called functions */int  vals[][3]={{256,256,256},{0,0,0},{256,256,256},{4096,4096,4096}};/* Function prototypes */double *timing_of();  /* Calculate time in seconds */int main(int argc, char **argv){  struct rusage start_usage, end_usage;  int ch;  double *perf_times;  char   *filename;  epsilon=0.225;  m=2.0;  seed=2000;  max_value=vals[dataset_type];  use_hashing=True;  while ( (ch=getopt(argc, argv,"hr:u:w:d:s:")) != EOF ) {    switch (ch) {    case 'h':      fprintf(stderr,"Usage\n" \	      "-r # number of bits to reduce\n"\	      "-u Disable use of hashing for bit reduction\n"\	      "-d [a|t|m|s] Use dataset atr, mri, test, seawifs\n"\	      "-w write cluster centers and memberships out\n"\	      "-s seed  Use seed as the random seed\n");      exit(1);    case 'r': /* number of bits to reduce */      R=atoi(optarg);      break;    case 'u': /* Disable use of hashing. Ultra fast, some pnemonic */      use_hashing=False;      break;    case 'w':      if ( !strcmp(optarg,"umatrix") ) write_umatrix=1;      if ( !strcmp(optarg,"centroids") ) write_centroids=1;      if ( !strcmp(optarg,"members") ) write_members=1;      if ( !strcmp(optarg,"all")) 	write_umatrix=write_centroids=write_members=1;      break;    case 'd':      if ( *optarg == 'a' ) dataset_type=ATR;      if ( *optarg == 'm' ) dataset_type=MRI;      if ( *optarg == 't' ) dataset_type=TEST;      max_value=vals[dataset_type];      break;    case 's':      seed=atol(optarg);      break;    default:    }  }  /* Print out main parameters for this run */  fprintf(stdout,"FCM Parameters\n clusterMethod=brfcm\n");  filename=argv[optind];  fprintf(stdout," file=%s\n",filename);  fprintf(stdout," bit reduction=%d\n use Hashing=%s\n\n",R,use_hashing?"yes":"no");  /* Load the dataset, using one of a particular group of datasets. */  switch (dataset_type) {  case TEST:    load_test_data(&X, &S, &N);    C=TEST_NUMBER_OF_CLUSTERS;    break;  case ATR:    load_atr_data(argv[optind],&X, &S, &N);    C=ATR_NUMBER_OF_CLUSTERS;    break;  case MRI:    load_mri_data(argv[optind], &X, &S, &N);    C=MRI_NUMBER_OF_CLUSTERS;    break;  }  fprintf(stdout, "Beginning to cluster...\n");  /* Time the brfcm algorithm */  getrusage(RUSAGE_SELF, &start_usage);  brfcm();  getrusage(RUSAGE_SELF, &end_usage);  /* Output whatever clustering results we need */  if ( write_centroids ) output_centroids(filename);  if ( write_umatrix   ) output_umatrix(filename);  if ( write_members   ) output_members(filename);     /* Output timing numbers */  perf_times=timing_of(start_usage, compressing_usage);  printf("Compressing Timing: %f user, %f system, %f total.\n", 				perf_times[0], perf_times[1], perf_times[0] +				perf_times[1]);  perf_times=timing_of(compressing_usage, end_usage);  printf("Clustering Timing: %f user, %f system, %f total.\n", 				perf_times[0], perf_times[1], perf_times[0] +				perf_times[1]);  perf_times=timing_of(start_usage, end_usage);  printf("Total Timing: %f user, %f system, %f total.\n", 				perf_times[0], perf_times[1], perf_times[0] +				perf_times[1]);  printf("Clustering required %d iterations.\n", number_of_iterations);  printf("Dataset size %d reduced to %d, %f%% reduction.\n", N, N0,			100.0*(1.0-(N0/(double)N)));  printf("Max. hash table collisions: %d\n", max_number_of_collisions);  return 0;}/************************************************************ *  * Main functions for the file: *  generally call only brfcm() * ************************************************************/int brfcm(){  double sqrerror = 2 * epsilon;    /* initialize counters */  number_of_iterations=0;  /* Reduce dataset before continuing. This routine will set bins     and return the number of reduced vectors. */  N0=reduce();  /* Dynamically allocate storage */  init();    /* Run the updates iteratively */  while (sqrerror > epsilon ) {    number_of_iterations++;    update_centroids();    sqrerror=update_umatrix();  }    /* We go ahead and update the centroids - presumably this will not      change much, since the overall square error in U is small */  update_centroids();      /* Special case for brfcm - distribute reduced vector membership */  distribute_membership();      return 0;}/*    update_centroids()    Given a membership matrix U, recalculate the cluster centroids as the    "weighted" mean of each contributing example from the dataset. Each    example contributes by an amount proportional to the membership value.*/int update_centroids(){  int i,k,x;  double numerator[S], denominator;  double U_ikm;  /* For each cluster */  for (i=0; i < C; i++)  {       /* Zero out numerator and denominator options */    denominator=0;    for (x=0; x < S; x++)       numerator[x]=0;    /* Calculate numerator and denominator together */    for (k=0; k < N0; k++) {      U_ikm=bins[k].w * pow(U(i,k),m);      denominator += U_ikm;      for (x=0; x < S; x++) 	numerator[x] += U_ikm * bins[k].X[x];    }        /* Calculate V */    for (x=0; x < S; x++)       V[i][x]= numerator[x] / denominator;      }  /* endfor: C clusters */  return 0;}double update_umatrix(){  int i,j,k;  int example_is_centroid;  double summation, D_k[C];  double square_difference=0;  double newU;  /* For each example in the dataset */  for ( k=0; k < N0; k++) {        /* Special case: If Example is equal to a Cluster Centroid,       then U=1.0 for that cluster and 0 for all others */    if ( (example_is_centroid=is_example_centroid(k)) != -1 ) {      fprintf(stderr,"Example is centroid\n");      for (i=0; i < C; i++) {	if ( i == example_is_centroid ) {	  square_difference += (U(i,k) -1.0) * (U(i,k)-1.0) * bins[k].w;	  U(i,k)=1.0;	} else {	  square_difference += U(i,k) * U(i,k) * bins[k].w;	  U(i,k)=0.0;	}      }      continue;    }    /* Cache the distance between this vector and all centroids. */    for (i=0; i < C; i++)       D_k[i]=distance(bins[k].X, V[i]);        /* For each class */    for (i=0; i < C; i++) {      summation=0;      /* Calculate summation */      for (j=0; j < C; j++) {	if ( i == j ) 	  summation+=1.0;	else	  summation += pow( D_k[i] / D_k[j] , (2.0/ (m-1)));      }      /* Weight is 1/sum */      newU=1.0/(double)summation;            /* Add to the squareDifference */      square_difference += (U(i,k) - newU) * (U(i,k) - newU) * bins[k].w ;            U(i,k)=newU;    }  } /* endfor N0 */    return square_difference;}/*=================================================================  General Utilities  init()                  - allocate space for data structures  is_example_centroid()   - Compare an example to cluster centroids  is_equal()              - Are two vectors equal (in all dimensions)  distance()              - Distance metric between two vectors  distribute_membership() - Distribute reduced-vector memberships to                             all members of the bin 			     (replace U with a full U).  =================================================================*//* Allocate storage for U and V dynamically. */int init(){  int i,j;  /* Allocate necessary storage */  V=(double **)CALLOC(C,sizeof(double *));  for (i=0; i < C; i++)     V[i]=(double *)CALLOC(S,sizeof(double));  U=(double **)CALLOC(N0, sizeof(double *));  for (i=0; i < N0; i++)    U[i]=(double *)CALLOC(C,sizeof(double));  /* Place random values in V, then update U matrix based on it */  srand48(seed);  for (i=0; i < C; i++) {    for (j=0; j < S; j++) {      V[i][j]=drand48() * max_value[j];    }  }    /* Once values are populated in V, update U matrix to sane values */  update_umatrix();  return 0;}/* If X[k] == V[i] for some i, then return that i. Otherwise, return -1 */int is_example_centroid(int k)
12 下一页
💿 文件大小 28 K
👤 上传用户 spie1
📂 所属分类其他
🏷️ 相关标签

#c-means #快速实现 #模糊 #聚类算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -