📄 databin.h
字号:
/* Ant-based Clustering Copyright (C) 2004 Julia Handl Email: Julia.Handl@gmx.de This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*//***************************************************date: 7.4.2003author: Julia Handl (julia.Handl@gmx.de)description: - wrapper class for data- data input- data normalisation- precomputation of dissimilarity matrix***************************************************/#ifndef DATABIN_JH_2003#define DATABIN_JH_2003#include "conf.h"#include "tmatrix.h"#include "databin.h"#include <fstream>#include "testset.h"#include "math.h"#include "random.h"template <class BINTYPE> class databin;template <class DBINTYPE> class docbin;/*************************************************** class data***************************************************/#define NRANSI#define MAXBIT 30#define MAXDIM 6static int iminarg1,iminarg2;#define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))template <class DATATYPE>class data { friend class databin<DATATYPE>; private: DATATYPE * vector; protected: conf * par; public: int color; int cluster; public: ~data(); /* Constructor for a data item that is initialized to the vector <d> and assigned cluster number <cluster> and color <color> */ data(conf * c, DATATYPE * d, int color, int cluster); /* Default constructor */ data(conf * c); /* Constructor for a data item that is initialized to the vector <d> */ data(conf * c, DATATYPE * d); /* Lenght of the data vector */ const int length(); DATATYPE square(DATATYPE x); /* Write-Read access to individual components */ DATATYPE &operator[](const int i); /* Distance computation between two data items */ const DATATYPE distanceto(data<DATATYPE> & d); /* Addition of two data vectors */ void add(data<DATATYPE> & d); /* Division of a data vector by <i> */ void div(int i); /* Data vector is set to <d> */ void set(data<DATATYPE> & d); /* Data vector is set to <d> */ void set(DATATYPE * d); friend ostream & operator<< <DATATYPE>(ostream &, data<DATATYPE> &);};/*************************************************** class databin***************************************************/template <class BINTYPE>class databin { protected: /* pointer to current parameter settings */ conf * par; /* array of pointer to data objects */ data<BINTYPE> ** bin; public: /* precomputed disimilarity matrix */ tmatrix<BINTYPE> * distancematrix; /* array of maxvalue for each attribute */ BINTYPE * maxvalue; BINTYPE * minvalue; BINTYPE * mean; BINTYPE * std; public: /* Constructor, hard-coded data collection as described by <name> */ databin(conf * c, char * name); /* Destructor */ ~databin(); /* Distance computation between two data items in the collection */ const BINTYPE d(const int index1, const int index2); /* Direct access to precomputed dissimilarity matrix */ const BINTYPE precomputed_d(const int index1, const int index2); /* Write-Read access to an individual data item in the collection */ data<BINTYPE> & operator[](const int i); /* Data output */ void dataoutput(char * filename); void distanceoutput(char * name, double mu); /* Check whether this class label already exists */ int find(char * templabel, char ** classlabel, int & labelctr); /* Generate new samples from the Normal Distribution (for artificial data) */ void regenerate(char * name); /* Permute order of data items */ void permutate();};/*************************************************** class data - function definitions***************************************************/// access to vector lengthtemplate <class DATATYPE>const int data <DATATYPE>::length() { return par->bindim;} // default constructortemplate <class DATATYPE>data <DATATYPE>::data(conf * c) { par = c; vector = new DATATYPE[par->bindim]; color = 0; cluster = 0; for (int i=0; i<par->bindim; i++) { vector[i] = 0; }} // constructor if data vector is providedtemplate <class DATATYPE>data <DATATYPE>:: data(conf * c, DATATYPE * dat) { par = c; vector = new DATATYPE[par->bindim]; color = 0; cluster = 0; for (int i=0; i<par->bindim; i++) { vector[i] = dat[i]; }} // constructor if data vector is providedtemplate <class DATATYPE>data <DATATYPE>:: data(conf * c, DATATYPE * dat, int col, int cl ) { par = c; vector = new DATATYPE[par->bindim]; color = col; cluster = cl; for (int i=0; i<par->bindim; i++) { vector[i] = dat[i]; }}// destructortemplate <class DATATYPE>data <DATATYPE>::~data() { delete [] vector;}// square functiontemplate <class DATATYPE> DATATYPE data <DATATYPE>::square(DATATYPE x) { return x*x;}// write-read access to coordinates of dat vector template <class DATATYPE> DATATYPE &data <DATATYPE>::operator[](const int i) { return vector[i];}// distance function defined between data vectors template <class DATATYPE>const DATATYPE data <DATATYPE>::distanceto(data<DATATYPE> & dd) { data & d = (data &)dd; DATATYPE result = 0.0; for (int i=0; i<par->bindim; i++) { result += square(d.vector[i] - vector[i]); } return sqrt(result);}// addition of data vectors template <class DATATYPE>void data <DATATYPE>::add(data<DATATYPE> & d) { for (int i=0; i<par->bindim; i++) { vector[i] += d.vector[i]; } }template <class DATATYPE> void data<DATATYPE>::set(data<DATATYPE> & d) { for (int i=0; i<par->bindim; i++) { vector[i] = d.vector[i]; }}template <class DATATYPE> void data<DATATYPE>::set(DATATYPE * d) { for (int i=0; i<par->bindim; i++) { vector[i] = d[i]; }}// division of a data vector by an integer template <class DATATYPE>void data <DATATYPE>::div(int divisor) { for (int i=0; i<par->bindim; i++) { vector[i] /= double(divisor); }}/*************************************************** class databin - function definitions***************************************************/// destructortemplate <class BINTYPE>databin <BINTYPE>::~databin() {#ifndef RANDDATA for (int i=0; i<par->binsize; i++) { delete bin[i]; } delete [] bin; delete [] maxvalue;#endif delete distancematrix; // cout << "Databin Destructor" << endl;}template <class BINTYPE> const BINTYPE databin <BINTYPE>::precomputed_d(const int index1, const int index2) { return (*distancematrix)(index1,index2);}// distance function between the bin's data itemstemplate <class BINTYPE>inline const BINTYPE databin <BINTYPE>::d(const int index1, const int index2) { return bin[index1]->distanceto(*bin[index2]);}// write-read access to data itemstemplate <class BINTYPE>inline data<BINTYPE> & databin <BINTYPE>::operator[](const int i) { return *bin[i];} template <class BINTYPE> int databin <BINTYPE>::find(char * templabel, char ** classlabel, int & labelctr) { for (int i=0; i<labelctr; i++) { if (strcmp(classlabel[i],templabel) == 0) { return i; } } strcpy(classlabel[labelctr], templabel); labelctr++; cout << "New Label: " << templabel << endl; return labelctr-1;} // constructor for class databin if a hard-coded test set (identified by <name>) is usedtemplate <class BINTYPE>databin <BINTYPE>::databin(conf * c, char * name) { par = c; testset t(name, c); t.generate(); par->bindim = 2; par->imax = (int)sqrt(double(par->binsize * 10)); par->jmax = par->imax; par->maxspeed = int(sqrt(2.0*0.5*par->imax*0.5*par->imax)); par->generations = max(25,int(double(par->binsize) /20)); par->kclusters = par->num_cluster;#ifdef CLUSTERING par->imax_som = 1; par->jmax_som = par->kclusters;#endif#ifdef TOPMAPPING par->imax_som = (int)sqrt(par->binsize); par->jmax_som = (int)sqrt(par->binsize);#endif maxvalue = new USED_DATA_TYPE[par->bindim]; minvalue = new USED_DATA_TYPE[par->bindim]; mean = new USED_DATA_TYPE[par->bindim]; std = new USED_DATA_TYPE[par->bindim]; for (int j=0; j<par->bindim; j++) { mean[j] = 0.0; for (int i=0; i<par->binsize; i++) { mean[j] += t.point_coordinates[i][j]; } } for (int j=0; j<par->bindim; j++) { mean[j] /= double(par->binsize); } // compute standard deviation for (int j=0; j<par->bindim; j++) { std[j] = 0.0; for (int i=0; i<par->binsize; i++) { double diff = t.point_coordinates[i][j]-mean[j]; std[j] += diff*diff; } std[j] /= par->binsize; std[j] = sqrt(std[j]); } for (int j=0; j<par->bindim; j++) { for (int i=0; i<par->binsize; i++) { t.point_coordinates[i][j] -= mean[j]; t.point_coordinates[i][j] /= std[j]; } } bin = new data<USED_DATA_TYPE>*[par->binsize]; if (bin == NULL) { cerr << "Databin: Memory allocation failed" << endl; exit(0); } int ctr = 0; int color = 0; for (int k=0; k<par->bindim; k++) { maxvalue[k] = -100000000.0; minvalue[k] = 100000000.0; } for (int i=0; i<par->num_cluster; i++) { for (int j=0; j<par->size_cluster[i]; j++) { bin[ctr] = new data<USED_DATA_TYPE>(par, t.point_coordinates[ctr],color+1, color); if (bin[ctr] == NULL) { cerr << "Databin: Memory allocation failed" << endl; exit(0); } for (int k=0; k<par->bindim; k++) { maxvalue[k] = max(maxvalue[k], (*(bin[ctr]))[k]); minvalue[k] = min(minvalue[k], (*(bin[ctr]))[k]); } ctr++; } color++; } distancematrix = new tmatrix<BINTYPE>(par->binsize); if (distancematrix == NULL) { cerr << "Databin: Memory allocation failed" << endl; exit(0); } // print the original document positions to a gnuplot file dataoutput("initialdata.dat"); // compute distances and mean par->mu = 0.0; par->max = 0.0; for (int i=0; i<par->binsize; i++) { for (int j=0; j<i; j++) { (*distancematrix)(i,j) = bin[i]->distanceto(*(bin[j])); par->mu += (*distancematrix)(i,j); par->max = max(par->max, (*distancematrix)(i,j)); } } par->mu /= 0.5*(par->binsize-1)*par->binsize; // normalize data for (int i=0; i<par->binsize; i++) { for (int j=0; j<i; j++) { (*distancematrix)(i,j) = (*distancematrix)(i,j) / par->max; } } par->max /= par->max; par->mu /= par->max; }template <class BINTYPE> void databin <BINTYPE>::permutate() { BINTYPE tempval; long idum = rand(); for (int i=0; i<par->binsize; i++) { int j = int(ran0(&idum)*(par->binsize)); data<BINTYPE> * temp = bin[i]; bin[i] = bin[j]; bin[j] = temp; for (int k=0; k<par->binsize; k++) { if ((k != i) && (k != j)) { tempval = (*distancematrix)(i,k); (*distancematrix)(i,k) = (*distancematrix)(j,k); (*distancematrix)(j,k) = tempval; } } } }template <class BINTYPE> void databin <BINTYPE>::regenerate(char * name) { testset t(name, par); t.generate(); for (int j=0; j<par->bindim; j++) { mean[j] = 0.0; for (int i=0; i<par->binsize; i++) { mean[j] += t.point_coordinates[i][j]; } } for (int j=0; j<par->bindim; j++) { mean[j] /= par->binsize; } // compute standard deviation for (int j=0; j<par->bindim; j++) { std[j] = 0.0; for (int i=0; i<par->binsize; i++) { double diff = t.point_coordinates[i][j]-mean[j]; std[j] += diff*diff; } std[j] /= par->binsize; std[j] = sqrt(std[j]); } int ctr = 0; int color = 0; for (int k=0; k<par->bindim; k++) { maxvalue[k] = 0.0; } for (int i=0; i<par->num_cluster; i++) { for (int j=0; j<par->binsize/par->num_cluster; j++) { bin[ctr] = new data<USED_DATA_TYPE>(par, t.point_coordinates[ctr],color+1, color); for (int k=0; k<par->bindim; k++) { maxvalue[k] = max(maxvalue[k], abs((*(bin[ctr]))[k])); } ctr++; } color++; } // compute distances and mean par->mu = 0.0; par->max = 0.0; for (int i=0; i<par->binsize; i++) { for (int j=0; j<i; j++) { (*distancematrix)(i,j) = bin[i]->distanceto(*(bin[j])); par->mu += (*distancematrix)(i,j); par->max = max(par->max, (*distancematrix)(i,j)); } } par->mu /= 0.5*(par->binsize-1)*par->binsize; // normalize data for (int i=0; i<par->binsize; i++) { for (int j=0; j<i; j++) { (*distancematrix)(i,j) = (*distancematrix)(i,j) / par->max; } } par->mu /= par->max; }template <class BINTYPE> void databin <BINTYPE>::dataoutput(char * name) { ofstream datastream(name); for (int i=0; i<par->binsize; i++) { for (int j=0; j<par->bindim; j++) { datastream << (*(bin[i]))[j] << " "; } datastream << (*(bin[i])).cluster << endl; }}template <class BINTYPE> void databin <BINTYPE>::distanceoutput(char * name, double mu) { int size = 100; double step = par->max / mu / double(size); int histo[size]; for (int i=0; i<size; i++) histo[i] = 0; ofstream datastream(name); for (int i=0; i<par->binsize; i++) { for (int j=0; j<i; j++) { histo[int((*distancematrix)(i,j) / mu / step) ]++; } } for (int i=0; i<size; i++) { datastream << double(i)*step << " " << histo[i] << endl; }} template <class DATATYPE>ostream & operator<<(ostream & s, data<DATATYPE> & d) { for (int i=0; i<d.length(); i++) { s.precision(3); s << d.vector[i] << " "; } s << endl; return s;}#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -