
📄 kullback_leibler_k_means.cc

📁 gmeans -- Clustering with first variation and splitting. Gmeans is a text-clustering package that supports three similarity measures: cosine, Euclidean, and Kullback-Leibler (KL) divergence. This file implements the KL variant; a minimal sketch of the KL measure follows this header block.
💻 C++ (.cc)
📖 Page 1 of 3
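In this scheme each document is treated as a probability distribution over words and is assigned to the cluster whose (smoothed) centroid distribution is closest in Kullback-Leibler divergence. The actual distance computation lives in Matrix::Kullback_leibler(), which is not shown on this page; the sketch below only illustrates that kind of measure, with an assumed smoothing form and hypothetical names (kl_divergence, alpha) that are not part of the gmeans package.

// Minimal sketch only -- NOT the package's Matrix::Kullback_leibler().
// D(p || q) = sum_w p[w] * log2(p[w] / q[w]), where p is a document's word
// distribution and q is a cluster centroid distribution smoothed so that
// q[w] > 0 wherever p[w] > 0. The smoothing form and all names here are
// illustrative assumptions.
#include <cmath>
#include <cstddef>
#include <vector>

double kl_divergence(const std::vector<double> &doc,       // normalized document distribution p
                     const std::vector<double> &centroid,  // normalized centroid distribution
                     double alpha)                         // Laplace-smoothing weight in [0, 1)
{
  const std::size_t n_words = doc.size();
  double d = 0.0;
  for (std::size_t w = 0; w < n_words; ++w)
    {
      if (doc[w] <= 0.0)
        continue;                                          // 0 * log 0 is taken as 0
      const double q = (1.0 - alpha) * centroid[w] + alpha / n_words;
      d += doc[w] * std::log(doc[w] / q);
    }
  return d / std::log(2.0);                                // in bits, matching the /log(2.0) conversions in the listing
}

In assign_cluster() below, Sim_Mat[j][i] plays the role of this distance, and document i is moved to the cluster j with the smallest value.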
/* Implementation of Kullback-Leibler K-means
   Copyright (c) 2002, Yuqiang Guan */

#include <time.h>
#include <assert.h>
#include <iostream.h>
#include <stdio.h>
#include <fstream.h>
#include <string.h>
#include <math.h>
#include "Kullback_leibler_k_means.h"

/* Constructor */
Kullback_leibler_k_means::Kullback_leibler_k_means(Matrix *p_docs, int cluster[], int ini_num_clusters,
                                                   int est_start, float Omiga, bool random_seeding,
                                                   int seed, float epsilon)
  : Gmeans(p_docs, cluster, ini_num_clusters, est_start, Omiga, random_seeding, seed, epsilon)
{
  // Sim_Mat[j][i] holds the KL distance of document i to cluster j.
  Sim_Mat = new VECTOR_double[n_Clusters];
  for (int j = 0; j < n_Clusters; j++)
    Sim_Mat[j] = new float[n_Docs];
  memory_consume += n_Clusters * n_Docs * sizeof(float);

  prior = new float[ini_num_clusters];
  C_log_C = cluster_quality;
  memory_consume += ini_num_clusters * sizeof(float);

  k_means_threshold = p_docs->MutualInfo() * epsilon;
}

/* Destructor */
Kullback_leibler_k_means::~Kullback_leibler_k_means()
{
  delete [] prior;
}

/* Kullback-Leibler K-means functions */

void Kullback_leibler_k_means::general_k_means(Matrix *p_Docs)
{
  int n_Iters, i;
  bool no_assignment_change;

  if (dumpswitch)
    cout << endl << "- Start Kullback-Leibler K-Means loop. -" << endl << endl;

  n_Iters = 0;
  no_assignment_change = true;
  stablized = false;
  fv_threshold = -1.0 * p_Docs->MutualInfo() * Delta;

  // Outer loop anneals the smoothing weight alpha; inner loop is the usual
  // assign / update iteration, stopped when the objective gain is small.
  do
    {
      do
        {
          pre_Result = Result;
          n_Iters++;

          if (assign_cluster(p_Docs, stablized) == 0)
            {
              if (dumpswitch)
                cout << "No points are moved in the last step " << endl << "@" << endl << endl;
              p_Docs->SetAlpha(p_Docs->GetAlpha() / 2.0, laplacian);
            }
          else
            {
              compute_cluster_size();
              no_assignment_change = false;
              p_Docs->SetAlpha(p_Docs->GetAlpha() / 2.0, laplacian);
              update_centroids(p_Docs);

              if (stablized)
                {
                }
              else
                {
                  for (i = 0; i < n_Clusters; i++)
                    p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]);
                }

              Result = p_Docs->GetNormSum() - coherence(n_Clusters) / log(2.0);
              if (dumpswitch)
                {
                  find_worst_vectors(false);
                  cout << "Obj. func. ( + Omiga * n_Clusters) = " << Result << endl << "@" << endl << endl;
                  if (verify)
                    cout << "Verify obj. func. : " << verify_obj_func(p_Docs, n_Clusters) << endl;
                  if (obj_inc.good())
                    obj_inc << "\t" << Result / p_Docs->MutualInfo() << "\t" << p_Docs->GetAlpha() * 2.0 << endl;
                }
              else
                cout << "K";
            }
        }
      while ((pre_Result - Result) > k_means_threshold);

      if ((dumpswitch) && (obj_inc.good()))
        obj_inc << "%%%%% K-means stops!" << endl;
    }
  while (p_Docs->GetAlpha() > Epsilon);
  cout << endl;

  //p_Docs->SetAlpha(0.0, laplacian);
  if (dumpswitch)
    {
      cout << "Kullback-Leibler K-Means loop stopped with " << n_Iters << " iterations." << endl;
      generate_Confusion_Matrix(label, n_Class);
    }
  if (stablized)
    {
    }
  if ((!no_assignment_change) && (f_v_times > 0))
    for (i = 0; i < n_Clusters; i++)
      update_quality_change_mat(p_Docs, i);
}

int Kullback_leibler_k_means::assign_cluster(Matrix *p_Docs, bool simi_est)
{
  int i, j, k, multi = 0, changed = 0, temp_Cluster_ID;
  float temp_sim;

  k = 0;
  if (simi_est)
    {
    }
  else
    {
      // Move each non-empty document to the cluster with the smallest KL distance.
      for (i = 0; i < n_Docs; i++)
        {
          while (i < empty_Docs_ID[k])
            {
              temp_sim = Sim_Mat[Cluster[i]][i];
              temp_Cluster_ID = Cluster[i];

              for (j = 0; j < n_Clusters; j++)
                if (j != Cluster[i])
                  {
                    multi++;
                    if (Sim_Mat[j][i] < temp_sim)
                      {
                        temp_sim = Sim_Mat[j][i];
                        temp_Cluster_ID = j;
                      }
                  }
              if (temp_Cluster_ID != Cluster[i])
                {
                  Cluster[i] = temp_Cluster_ID;
                  Sim_Mat[Cluster[i]][i] = temp_sim;
                  changed++;
                }
              i++;
            }
          k++;
        }
    }
  if (dumpswitch)
    {
      cout << multi << " dot product computation\n";
      cout << changed << " assignment changes\n";
    }
  return changed;
}

void Kullback_leibler_k_means::compute_sum_log(int i)
{
  int j;

  // C_log_C[i] accumulates sum_w c_w log c_w - prior_i log prior_i for cluster i.
  C_log_C[i] = 0;
  for (j = 0; j < n_Words; j++)
    if (Concept_Vector[i][j] > 0)
      C_log_C[i] += Concept_Vector[i][j] * log(Concept_Vector[i][j]);
  if (prior[i] > 0)
    C_log_C[i] -= prior[i] * log(prior[i]);
}

void Kullback_leibler_k_means::update_centroids(Matrix *p_Docs)
{
  int i, j, k;

  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;

  k = 0;
  switch (laplacian)
    {
    case NOLAPLACE:
    case CENTER_LAPLACE:
      for (i = 0; i < n_Docs; i++)
        {
          while (i < empty_Docs_ID[k])
            {
              p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]);
              i++;
            }
          k++;
        }
      break;
    case PRIOR_LAPLACE:
      for (i = 0; i < n_Clusters; i++)
        for (j = 0; j < n_Words; j++)
          Concept_Vector[i][j] = +ClusterSize[i];
      for (i = 0; i < n_Docs; i++)
        {
          while (i < empty_Docs_ID[k])
            {
              p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]);
              i++;
            }
          k++;
        }
      break;
    }
  for (i = 0; i < n_Clusters; i++)
    {
      prior[i] = norm_1(Concept_Vector[i], n_Words);
      compute_sum_log(i);
    }
}

/* initialization functions */

void Kullback_leibler_k_means::initialize_cv(Matrix *p_Docs, char *seeding_file)
{
  int i, j;

  switch (init_clustering)
    {
    case RANDOM_PERTURB_INIT:
      random_perturb_init(p_Docs);
      break;
    case RANDOM_CLUSTER_ID_INIT:
      random_cluster_ID();
      break;
    case CONCEPT_VECTORS_INIT:
      random_centroids(p_Docs);
      break;
    case WELL_SEPARATED_CENTROID_INIT:
      well_separated_centroids(p_Docs, 0);
      break;
    case WELL_SEPARATED_CENTROID_INIT_MODIFY:
      well_separated_centroids(p_Docs, 1);
      break;
    case SEEDINGFILE_INIT:
      seeding_file_init(p_Docs, seeding_file);
      break;
    case ALREADY_ASSIGNED:
      break;
    default:
      random_cluster_ID();
      break;
    }

  // reset concept vectors
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;

  switch (laplacian)
    {
    case NOLAPLACE:
    case CENTER_LAPLACE:
      for (i = 0; i < n_Docs; i++)
        if ((Cluster[i] >= 0) && (Cluster[i] < n_Clusters)) // marked cluster[i]<0 points as seeds
          p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]);
        else
          Cluster[i] = 0;
      break;
    case PRIOR_LAPLACE:
      for (i = 0; i < n_Clusters; i++)
        for (j = 0; j < n_Words; j++)
          Concept_Vector[i][j] = +ClusterSize[i];
      for (i = 0; i < n_Docs; i++)
        if ((Cluster[i] >= 0) && (Cluster[i] < n_Clusters)) // marked cluster[i]<0 points as seeds
          p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]);
        else
          Cluster[i] = 0;
      break;
    }
  for (i = 0; i < n_Clusters; i++)
    {
      prior[i] = norm_1(Concept_Vector[i], n_Words);
      compute_sum_log(i);
    }
  compute_cluster_size();

  if (laplacian == PRIOR_LAPLACE)
    for (i = 0; i < n_Clusters; i++)
      for (j = 0; j < n_Words; j++)
        Concept_Vector[i][j] += ClusterSize[i];

  for (i = 0; i < n_Clusters; i++)
    p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]);

  //cout<<coherence(n_Clusters)/log(2.0)<<endl;
  //cout<<p_Docs->GetNormSum()<<endl;
  Result = p_Docs->GetNormSum() - coherence(n_Clusters) / log(2.0);

  if (dumpswitch || evaluate)
    {
      outputClusterSize();
      cout << "Initial Obj. func.: " << Result << endl;
      if (n_Class > 0)
        cout << "Initial confusion matrix :" << endl;
      generate_Confusion_Matrix(label, n_Class);
    }
  if (evaluate)
    {
      purity_Entropy_MutInfo();
      F_measure(label, n_Class);
      micro_avg_precision_recall();
      cout << "The mutual information of the original matrix is : " << p_Docs->MutualInfo() << endl;
      cout << "The mutual information of the final matrix is    : " << p_Docs->MutualInfo() - Result << " ("
           << (p_Docs->MutualInfo() - Result) * 100.0 / p_Docs->MutualInfo() << "%)" << endl;
      cout << endl;
      cout << "* Evaluation done. *" << endl;
      exit(0);
    }
  if (f_v_times > 0)
    {
      quality_change_mat = new float *[n_Clusters];
      for (int j = 0; j < n_Clusters; j++)
        quality_change_mat[j] = new float[n_Docs];
      memory_consume += (n_Clusters * n_Docs) * sizeof(float);
      for (i = 0; i < n_Clusters; i++)
        update_quality_change_mat(p_Docs, i);
    }
  memory_consume += p_Docs->GetMemoryUsed();
}

void Kullback_leibler_k_means::well_separated_centroids(Matrix *p_Docs, int choice)
{
  int i, j, k, max_ind, *cv = new int[n_Clusters];
  float max, cos_sum;
  bool *mark = new bool[n_Docs];

  for (i = 0; i < n_Docs; i++)
    mark[i] = false;
  for (i = 0; i < n_Empty_Docs; i++)
    mark[empty_Docs_ID[i]] = true;

  switch (laplacian)
    {
    case NOLAPLACE:
    case CENTER_LAPLACE:
      for (i = 0; i < n_Clusters; i++)
        for (j = 0; j < n_Words; j++)
          Concept_Vector[i][j] = 0.0;
      break;
    case PRIOR_LAPLACE:
      for (i = 0; i < n_Clusters; i++)
        for (j = 0; j < n_Words; j++)
          Concept_Vector[i][j] = 1.0;
      break;
    }
  switch (choice)
    {
    case 0:
      do {
        cv[0] = rand_gen.GetUniformInt(n_Docs);
      } while (mark[cv[0]]);
      if (dumpswitch)
        {
          cout << "Cluster centroids are chosen to be well separated from each other." << endl;
          cout << "Start with a random chosen vector" << endl;
        }
      break;
    case 1:
    default:
      float *v;
      v = new float[n_Words];
      switch (laplacian)
        {
        case NOLAPLACE:
        case CENTER_LAPLACE:
          for (i = 0; i < n_Words; i++)
            v[i] = 0.0;
          break;
        case PRIOR_LAPLACE:
          for (i = 0; i < n_Words; i++)
            v[i] = n_Docs;
          break;
        }
      for (i = 0; i < n_Docs; i++)
        p_Docs->ith_add_CV_prior(i, v);
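The listing above is page 1 of 3 and stops inside well_separated_centroids(). For orientation, the quantity that general_k_means() drives down, Result = GetNormSum() - coherence(n_Clusters)/log(2.0), can be read as the prior-weighted sum of KL divergences from each document to its cluster centroid, i.e. the mutual information lost by the clustering; that is why the evaluation branch reports MutualInfo() - Result as the mutual information retained. The sketch below is illustrative only, reuses the hypothetical kl_divergence() from the sketch before the listing, and assumes already-normalized inputs; none of it is part of the gmeans package.

// Illustrative only: a prior-weighted sum of per-document KL divergences,
// one way to read the objective minimized by general_k_means().
#include <cstddef>
#include <vector>

double kl_divergence(const std::vector<double> &doc,
                     const std::vector<double> &centroid,
                     double alpha);                        // from the sketch above (hypothetical)

double kl_kmeans_objective(const std::vector<std::vector<double> > &docs,      // p_x, one per document
                           const std::vector<double>               &doc_prior, // pi_x, sums to 1
                           const std::vector<std::vector<double> > &centroid,  // m_j, one per cluster
                           const std::vector<int>                  &cluster,   // assignment x -> j
                           double alpha)
{
  double obj = 0.0;
  for (std::size_t x = 0; x < docs.size(); ++x)
    obj += doc_prior[x] * kl_divergence(docs[x], centroid[cluster[x]], alpha);
  return obj;                                              // smaller is better
}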
