⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spherical_k_means.cc

📁 gmeans-- Clustering with first variation and splitting 文本聚类算法Gmeans ,使用了3种相似度函数,cosine,euclidean ,K
💻 CC
📖 第 1 页 / 共 2 页
字号:
/*   Implementation of Spherical K-means      Copyright (c) 2003, Yuqiang Guan
 *
 *   Spherical k-means clusters unit-length document vectors by cosine
 *   similarity: each cluster is represented by a normalized "concept
 *   vector" (centroid), and documents are assigned to the concept vector
 *   with which they have the largest dot product.  This file implements
 *   the k-means loop, assignment step, centroid update, and several
 *   initialization strategies on top of the Gmeans base class.
 *
 *   NOTE(review): inherited state (Cluster, Concept_Vector, Sim_Mat,
 *   empty_Docs_ID, diff, old_CV, CV_norm, dumpswitch, f_v_times, ...) is
 *   declared in the Gmeans / Spherical_k_means headers, outside this view.
 */
#include <time.h>
#include <assert.h>
#include <iostream.h>
#include <stdio.h>
#include <fstream.h>
#include <string.h>
#include <math.h>
#include "Spherical_k_means.h"

/* Constructor.
 *
 * Forwards all parameters to the Gmeans base constructor, then allocates
 * Sim_Mat, an n_Clusters x n_Docs matrix where Sim_Mat[c][d] caches the
 * (possibly estimated) similarity between concept vector c and document d.
 * memory_consume tracks the allocation for later reporting.
 */
Spherical_k_means::Spherical_k_means(Matrix *p_docs, int cluster[], int ini_num_clusters, int est_start, float Omiga, bool random_seeding, int seed, float epsilon) : Gmeans(p_docs, cluster, ini_num_clusters, est_start, Omiga, random_seeding, seed, epsilon)
{
  Sim_Mat = new VECTOR_double[n_Clusters];
  for (int j = 0; j < n_Clusters; j++)
    Sim_Mat[j] = new float[n_Docs];
  memory_consume+=n_Clusters*n_Docs*sizeof(float);
}

/* Spherical K-means functions */

/* Main spherical k-means loop.
 *
 * Alternates assign_cluster() and update_centroids() until the relative
 * improvement of the objective falls to Epsilon*Result or below
 * (loop condition: (Result - pre_Result) > Epsilon*Result).
 *
 * After EST_START iterations the 'stablized' flag turns on similarity
 * estimation: instead of recomputing every doc-centroid dot product,
 * diff[i] (the Euclidean distance each concept vector moved this
 * iteration) is used by assign_cluster() to bound how much a cached
 * similarity can have changed, so exact dot products are computed lazily.
 */
void Spherical_k_means::general_k_means(Matrix *p_Docs)
{
  int n_Iters, i, j;
  if(dumpswitch)
    cout<<endl<<"- Start Spherical K-Means loop. -"<<endl<<endl;
  n_Iters =0;
  no_assignment_change =true;
  do
    {
      pre_Result = Result;
      n_Iters ++;
      if (n_Iters >EST_START)
	stablized =true;           // switch to estimated similarities from now on
      if(dumpswitch && stablized)
	cout <<"(Similarity estimation used.)"<<endl;
      if ( assign_cluster(p_Docs, stablized) == 0 )
	{
	  // No document changed cluster: centroids are unchanged, skip update.
	  if (dumpswitch)
	    cout<<"No points are moved in the last step "<<endl<<"@"<<endl<<endl;
	}
      else
	{
	  compute_cluster_size();
	  no_assignment_change = false;
	  // Snapshot the concept vectors so we can measure how far each moves.
	  if( stablized)
	    for (i = 0; i < n_Clusters; i++)
	      for (j = 0; j < n_Words; j++)
		old_CV[i][j] = Concept_Vector[i][j];
	  update_centroids(p_Docs);
	  if(stablized)
	    {
	      // diff[i] = ||old_CV[i] - Concept_Vector[i]||_2, the movement of
	      // centroid i; assign_cluster() adds this to cached similarities
	      // as a change bound instead of recomputing them all.
	      for (i = 0; i < n_Clusters; i++)
		{
		  diff[i] = 0.0;
		  for (j = 0; j < n_Words; j++)
		    diff[i] += (old_CV[i][j] - Concept_Vector[i][j]) * (old_CV[i][j] - Concept_Vector[i][j]);
		  diff[i] = sqrt(diff[i]);
		}
	      // Only each doc's similarity to its OWN cluster is recomputed
	      // exactly; the rest stay estimated.
	      for ( i=0; i<n_Docs; i++)
		Sim_Mat[Cluster[i]][i] = p_Docs->dot_mult (Concept_Vector[Cluster[i]], i);
	    }
	  else
	    // Exact mode: refresh the full similarity row for every cluster.
	    for (i = 0; i < n_Clusters; i++)
	      p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
	  Result = coherence(n_Clusters);
	  if(dumpswitch)
	    {
	      find_worst_vectors(false);
	      cout<<"Obj. func. ( + Omiga * n_Clusters) = "<<Result<<endl<<"@"<<endl<<endl;
	      if (verify)
		cout<<"Verify obj. func. : "<<verify_obj_func(p_Docs,n_Clusters)<<endl;
	    }
	  else
	    cout<<"S";           // terse per-iteration progress tick
	}
    }
  while ((Result - pre_Result) > Epsilon*Result);
  cout<<endl;
  if (dumpswitch)
  {
    cout <<"Spherical K-Means loop stoped with "<<n_Iters<<" iterations."<<endl;
    generate_Confusion_Matrix (label, n_Class);
  }
  // If we ended in estimation mode, restore exact similarities for callers.
  if (stablized)
    for (i = 0; i < n_Clusters; i++)
      p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  // Refresh the first-variation bookkeeping only if assignments changed.
  if ((!no_assignment_change) && (f_v_times >0))
    for (i=0; i<n_Clusters; i++)
      update_quality_change_mat(p_Docs, i);
}

/* Assignment step: move each document to the cluster whose concept vector
 * it is most similar to (largest cached Sim_Mat entry).
 *
 * simi_est == true : Sim_Mat rows for OTHER clusters are first inflated by
 *   diff[i] (how far centroid i moved), making each cached value an
 *   optimistic estimate of the new similarity.  An exact dot product is
 *   computed only when the inflated estimate beats the doc's current best,
 *   which prunes most dot products ('multi' counts the exact ones done).
 * simi_est == false: cached similarities are used as-is (they were fully
 *   recomputed by the caller).
 *
 * The for/while pattern with empty_Docs_ID skips empty documents:
 * empty_Docs_ID appears to be a sorted list of empty-doc indices with a
 * sentinel past the end -- TODO(review): confirm in the Gmeans base class.
 * The inner while advances i over real docs; when i hits empty_Docs_ID[k]
 * the while exits and the outer for's i++ steps over the empty doc.
 *
 * Returns the number of documents whose assignment changed.
 */
int Spherical_k_means::assign_cluster(Matrix *p_Docs, bool simi_est)
{
  int i,j, k, multi=0, changed=0,  temp_Cluster_ID;
  float temp_sim;
  k=0;
  if(simi_est)
    {
      // Inflate every cached similarity to a cluster the doc is NOT in by
      // that centroid's movement, turning it into an upper-style estimate
      // (assumes unit-normalized document vectors -- TODO confirm).
      for (i = 0; i < n_Clusters; i++)
	for (j = 0; j < n_Docs; j++)
	  if (i != Cluster[j])
	    Sim_Mat[i][j] += diff[i];
      for (i = 0; i < n_Docs; i++)
	{
	  while (i<empty_Docs_ID[k])
	    {
	      temp_sim = Sim_Mat[Cluster[i]][i];   // exact: own-cluster sim was recomputed
	      temp_Cluster_ID = Cluster[i];
	      for (j = 0; j < n_Clusters; j++)
		if (j != Cluster[i])
		  {
		    if (Sim_Mat[j][i] > temp_sim)
		      {
			// Estimate beats current best: pay for the exact
			// dot product, then re-test with the true value.
			multi++;
			Sim_Mat[j][i] = p_Docs->dot_mult(Concept_Vector[j], i);
			if (Sim_Mat[j][i] > temp_sim)
			  {
			    temp_sim = Sim_Mat[j][i];
			    temp_Cluster_ID = j;
			  }
		      }
		  }
	      if (temp_Cluster_ID != Cluster[i])
		{
		  Cluster[i] = temp_Cluster_ID;
		  Sim_Mat[Cluster[i]][i] = temp_sim;
		  changed++;
		}
	      i++;
	    }
	  k++;   // i reached the k-th empty doc; skip it via the for's i++
	}
    }
  else
    {
      // Exact mode: straightforward argmax over the cached similarity rows.
      for (i = 0; i < n_Docs; i++)
	{
	  while (i<empty_Docs_ID[k])
	    {
	      temp_sim = Sim_Mat[Cluster[i]][i];
	      temp_Cluster_ID = Cluster[i];
	      for (j = 0; j < n_Clusters; j++)
		if (j != Cluster[i])
		  {
		    multi++;
		    if (Sim_Mat[j][i]>temp_sim )
		      {
			temp_sim = Sim_Mat[j][i];
			temp_Cluster_ID = j;
		      }
		  }
	      if (temp_Cluster_ID != Cluster[i])
		{
		  Cluster[i] = temp_Cluster_ID;
		  Sim_Mat[Cluster[i]][i] = temp_sim;
		  changed++;
		}
	      i++;
	    }
	  k++;
	}
    }
  if(dumpswitch)
    {
      cout << multi << " dot product computation\n";
      cout << changed << " assignment changes\n";
    }
  return changed;
}

/* Recompute each concept vector as the (normalized) sum of its member
 * documents, skipping empty docs via the empty_Docs_ID pattern described
 * above.  CV_norm[i] records the pre-normalization length returned by
 * normalize_vec().
 */
void Spherical_k_means::update_centroids(Matrix *p_Docs)
{
  int i, j, k;
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  k=0;
  for (i = 0; i < n_Docs; i++)
    {
      while (i<empty_Docs_ID[k])
	{
	  p_Docs->ith_add_CV(i, Concept_Vector[Cluster[i]]);
	  i++;
	}
      k++;
    }
  for (i = 0; i < n_Clusters; i++)
    CV_norm[i]=normalize_vec(Concept_Vector[i], n_Words);
}

/* initialization functions */

/* Seed the clustering according to init_clustering, then build the initial
 * concept vectors, similarity matrix, and objective value from the
 * resulting assignment.
 *
 * Side effects beyond initialization:
 *  - if 'evaluate' is set, prints evaluation metrics and calls exit(0),
 *    terminating the whole process;
 *  - if f_v_times > 0, allocates quality_change_mat (n_Clusters x n_Docs)
 *    for the first-variation phase.
 */
void Spherical_k_means::initialize_cv(Matrix *p_Docs, char * seeding_file)
{
  int i, j;
  switch (init_clustering)
    {
    case RANDOM_PERTURB_INIT:
      random_perturb_init(p_Docs);
      break;
    case RANDOM_CLUSTER_ID_INIT:
      random_cluster_ID();
      break;
    case CONCEPT_VECTORS_INIT:
      random_centroids(p_Docs);
      break;
    case WELL_SEPARATED_CENTROID_INIT:
      well_separated_centroids(p_Docs, 0);
      break;
    case WELL_SEPARATED_CENTROID_INIT_MODIFY:
      well_separated_centroids(p_Docs, 1);
      break;
    case SEEDINGFILE_INIT:
      seeding_file_init(p_Docs, seeding_file);
      break;
    case ALREADY_ASSIGNED:
      // Cluster[] was supplied by the caller; nothing to do here.
      break;
    default:
      well_separated_centroids(p_Docs, 1);
      break;
    }
  // reset concept vectors and rebuild them from the seeded assignment
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  for (i = 0; i < n_Docs; i++)
    {
      if((Cluster[i] >=0) && (Cluster[i] < n_Clusters)) // Cluster[i]<0 marks points reserved as seeds
	p_Docs->ith_add_CV(i, Concept_Vector[Cluster[i]]);
      else
	Cluster[i] =0;   // out-of-range / seed-marked docs fall back to cluster 0
    }
  compute_cluster_size();
  for (i = 0; i < n_Clusters; i++)
    CV_norm[i]=normalize_vec(Concept_Vector[i], n_Words);
  for (i = 0; i < n_Clusters; i++)
    p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  //for (i = 0; i < n_Clusters; i++)
  //diff[i] = 0.0;
  // because we need to report the objective (coherence) here.
  Result = coherence(n_Clusters);
  if(dumpswitch || evaluate)
    {
      outputClusterSize();
      cout<<"Initial Obj. func.: "<<Result<<endl;
      if (n_Class >0)
	cout<<"Initial confusion matrix :"<<endl;
      generate_Confusion_Matrix (label, n_Class);
    }
  if(evaluate)
    {
      // Evaluation-only mode: print metrics and terminate the process.
      purity_Entropy_MutInfo();
      F_measure(label, n_Class);
      micro_avg_precision_recall();
      cout<<endl;
      cout <<"* Evaluation done. *"<<endl;
      exit (0);
    }
  if (f_v_times >0)
    {
      quality_change_mat = new (float *)[n_Clusters];
      for (int j = 0; j < n_Clusters; j++)
	quality_change_mat [j] = new float[n_Docs];
      memory_consume+=(n_Clusters*n_Docs)*sizeof(float);
      for (i = 0; i < n_Clusters; i++)
	update_quality_change_mat (p_Docs, i);
    }
  memory_consume+=p_Docs->GetMemoryUsed();
}

/* Greedy farthest-first seeding: pick a first seed document, then
 * repeatedly pick the unmarked document with the smallest summed cosine
 * similarity to all seeds chosen so far.
 *
 * choice == 0: first seed is a random non-empty document.
 * choice == 1 (and default): first seed is the document least similar to
 *   the normalized centroid of the whole data set.
 *
 * Finishes by running one exact assign_cluster() pass from the seeds.
 */
void Spherical_k_means::well_separated_centroids(Matrix *p_Docs, int choice)
{
  int i, j, k, min_ind, *cv = new int [n_Clusters];
  float min, cos_sum;
  bool *mark = new bool [n_Docs];
  for (i=0; i< n_Docs; i++)
    mark[i] = false;
  for (i=0; i< n_Empty_Docs; i++)
    mark[empty_Docs_ID[i]] = true;   // never pick an empty doc as a seed
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  switch (choice)
    {
    case 0:
      // Random non-empty first seed.
      do{
	cv[0] = rand_gen.GetUniformInt(n_Docs);
      }while (mark[cv[0]]);
      if(dumpswitch)
	{
	  cout<<"Cluster centroids are chosen to be well separated from each other."<<endl;
	  cout<<"Start with a random chosen vector"<<endl;
	}
      break;
    case 1:
    default:
      // First seed = doc with minimum dot product against the normalized
      // mean vector v of the whole collection.
      // NOTE(review): this 'min' shadows the function-scope 'min' above.
      float *v, min;
      int min_ID=0;
      v = new float[n_Words];
      for (i = 0; i < n_Words; i++)
	v[i] = 0.0;
      for (i = 0; i < n_Docs; i++)
	p_Docs->ith_add_CV(i, v);
      float temp;
      k=0;
      normalize_vec(v, n_Words);
      min =1.0;
      min_ID =0;
      for(i=0; i<n_Docs; i++)
	{
	  while (i<empty_Docs_ID[k])   // skip empty docs (same pattern as assign_cluster)
	    {
	      temp = p_Docs->dot_mult(v, i);
	      if ( temp < min )
		{
		  min = temp;
		  min_ID = i;
		}
	      i++;
	    }
	  k++;
	}
      cv[0] = min_ID;
      delete [] v;
      if(dumpswitch)
	{
	  cout<<"Cluster centroids are chosen to be well separated from each other."<<endl;
	  cout<<"Start with a vector farthest from the centroid of the whole data set"<<endl;
	}
      break;
    }
  p_Docs->ith_add_CV(cv[0], Concept_Vector[0]);
  mark[cv[0]] = true;
  p_Docs->trans_mult(Concept_Vector[0], Sim_Mat[0]);
  // Greedily add the doc least similar (in summed cosine) to all seeds so far.
  for (i=1; i<n_Clusters; i++)
    {
      min_ind = 0;
      min = 2.0 * n_Clusters;   // larger than any possible cos_sum of i terms
      for (j=0; j<n_Docs; j++)
	{
	  if(!mark[j])
	    {
	      cos_sum = 0;
	      for (k=0; k<i; k++)
		cos_sum += Sim_Mat[k][j];
	      if (cos_sum < min)
		{
		  min = cos_sum;
		  min_ind = j;
		}
	    }
	}
      cv[i] = min_ind;
      p_Docs->ith_add_CV(cv[i], Concept_Vector[i]);
      p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
      mark[cv[i]] = true;
    }
  for(i=0; i<n_Docs; i++)
    Cluster[i] =0;
  assign_cluster(p_Docs, false);
  if(dumpswitch)
    {
      cout<<"Vectors chosen to be the centroids are : ";
      for (i=0; i<n_Clusters; i++)
	cout<<cv[i]<<" ";
      cout<<endl;
    }
  delete [] cv;
  delete [] mark;
}

/* Random seeding: pick n_Clusters distinct random documents as the initial
 * concept vectors, then run one exact assignment pass.
 * NOTE(review): unlike well_separated_centroids(), empty docs are NOT
 * pre-marked here, so an empty doc could be drawn as a seed -- confirm
 * whether that is intended.
 */
void Spherical_k_means::random_centroids(Matrix *p_Docs)
{
  int i, j, *cv = new int [n_Clusters];
  bool *mark= new bool[n_Docs];
  for (i=0; i< n_Docs; i++)
    mark[i] = false;
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  for (i=0; i<n_Clusters; i++)
    {
      do{
	cv[i] = rand_gen.GetUniformInt(n_Docs);
      }
      while (mark[cv[i]]);   // reject duplicates
      mark[cv[i]] = true;
      p_Docs->ith_add_CV(cv[i], Concept_Vector[i]);
    }
  if(dumpswitch)
    {
      cout<<"Randomly chose centroids"<<endl;
      cout<<"Vectors chosen to be the centroids are : ";
      for (i=0; i<n_Clusters; i++)
	cout<<cv[i]<<" ";
      cout<<endl;
    }
  delete [] mark;
  delete [] cv;
  for (i = 0; i < n_Clusters; i++)
    p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  for(i=0; i<n_Docs; i++)
    Cluster[i] =0;
  assign_cluster(p_Docs, false);
}

/* Perturbation seeding: compute the normalized mean vector of the whole
 * collection, then create each concept vector as a random multiplicative
 * perturbation of it (random direction scaled by Perturb), normalize, and
 * run one exact assignment pass.
 */
void Spherical_k_means::random_perturb_init(Matrix *p_Docs)
{
  VECTOR_double v, temp_v;
  int i, j;
  float temp_norm;
  v = new float[n_Words];
  temp_v = new float[n_Words];
  for (i = 0; i < n_Words; i++)
    v[i] = 0.0;
  for (i = 0; i < n_Docs; i++)
    p_Docs->ith_add_CV(i, v);
  normalize_vec(v, n_Words);   // v = unit mean vector of the data set
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  for (i = 0; i < n_Clusters; i++)
    {
      // Random unit direction, scaled by Perturb * U(0,1).
      for (j = 0; j < n_Words; j++)
	temp_v[j] = rand_gen.GetUniform() - 0.5;
      normalize_vec(temp_v, n_Words);
      temp_norm = Perturb * rand_gen.GetUniform();
      for (j = 0; j < n_Words; j++)
	temp_v[j] *= temp_norm;
      // fabs() keeps components non-negative (document vectors are term counts).
      for (j = 0; j < n_Words; j++)
	Concept_Vector[i][j] += fabs(v[j]*(1+ temp_v[j]));
    }
  delete [] v;
  delete [] temp_v;
  for (i = 0; i < n_Clusters; i++)
    CV_norm[i] = normalize_vec(Concept_Vector[i], n_Words);
  for (i = 0; i < n_Clusters; i++)
    p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  for(i=0; i<n_Docs; i++)
    Cluster[i] =0;
  assign_cluster(p_Docs, false);
  if(dumpswitch)
    {
      cout<<"Generated the centroid of whole data set then perturb around it"<<endl;
    }
}

/* first variations functions */

/* Kernighan-Lin style first-variation refinement.
 * (Original author note: the sum vector is used instead of the concept
 * vector in the first-variation computation.)
 * NOTE(review): this function is truncated in this view -- only its local
 * allocations are visible here; the body continues on the next page.
 */
float Spherical_k_means::K_L_first_variation(Matrix *p_Docs)
{
  int i, j, max_change_index=-1;
  float *change= new float [f_v_times], *total_change =new float [f_v_times], max_change=0.0, pre_change;
  float *old_CV_norm = new float[n_Clusters];
  one_step *track = new (one_step) [f_v_times];
  bool *been_converted = new bool [n_Clusters];
  int *original_Cluster =new int [f_v_times], *old_ClusterSize =new int [n_Clusters];

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -