📄 kullback_leibler_k_means.cc
字号:
/* Implementation of Kullback-Leibler K-means Copyright (c) 2002, Yuqiang Guan*/#include <time.h>#include <assert.h>#include <iostream.h>#include <stdio.h>#include <fstream.h>#include <string.h>#include <math.h>#include "Kullback_leibler_k_means.h"/* Constructor */Kullback_leibler_k_means::Kullback_leibler_k_means(Matrix *p_docs, int cluster[], int ini_num_clusters, int est_start, float Omiga, bool random_seeding, int seed, float epsilon): Gmeans(p_docs, cluster, ini_num_clusters, est_start, Omiga, random_seeding, seed, epsilon){ Sim_Mat = new VECTOR_double[n_Clusters]; for (int j = 0; j < n_Clusters; j++) Sim_Mat[j] = new float[n_Docs]; memory_consume+=n_Clusters*n_Docs*sizeof(float); prior = new float [ini_num_clusters]; C_log_C = cluster_quality; memory_consume+=(ini_num_clusters)*sizeof(float); k_means_threshold= p_docs->MutualInfo()*epsilon; }/* Destructor */Kullback_leibler_k_means::~Kullback_leibler_k_means(){ delete [] prior;}/* Kullback-Leibler K-means functions */void Kullback_leibler_k_means::general_k_means(Matrix *p_Docs){ int n_Iters, i; bool no_assignment_change; if(dumpswitch) cout<<endl<<"- Start Kullback-Leibler K-Means loop. -"<<endl<<endl; n_Iters =0; no_assignment_change =true; stablized = false; fv_threshold = -1.0*p_Docs->MutualInfo()*Delta; do { do { pre_Result = Result; n_Iters ++; if ( assign_cluster(p_Docs, stablized) == 0 ) { if (dumpswitch) cout<<"No points are moved in the last step "<<endl<<"@"<<endl<<endl; p_Docs->SetAlpha(p_Docs->GetAlpha()/2.0, laplacian); } else { compute_cluster_size(); no_assignment_change = false; p_Docs->SetAlpha(p_Docs->GetAlpha()/2.0, laplacian); update_centroids(p_Docs); if(stablized) { } else { for ( i=0; i< n_Clusters; i++) p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]); } Result = p_Docs->GetNormSum() - coherence(n_Clusters)/ log(2.0); if(dumpswitch) { find_worst_vectors(false); cout<<"Obj. func. ( + Omiga * n_Clusters) = "<<Result<<endl<<"@"<<endl<<endl; if (verify) cout<<"Verify obj. func. : "<<verify_obj_func(p_Docs,n_Clusters)<<endl; if (obj_inc.good()) obj_inc<<"\t"<<Result/p_Docs->MutualInfo()<<"\t"<<p_Docs->GetAlpha()*2.0<<endl; } else cout<<"K"; } } while ((pre_Result - Result) > k_means_threshold); if ((dumpswitch) && (obj_inc.good())) obj_inc<<"%%%%% K-means stops!"<<endl; } while (p_Docs->GetAlpha() >Epsilon); cout<<endl; //p_Docs->SetAlpha(0.0, laplacian); if (dumpswitch) { cout <<"Kullback-Leibler K-Means loop stoped with "<<n_Iters<<" iterations."<<endl; generate_Confusion_Matrix (label, n_Class); } if (stablized) { } if ((!no_assignment_change) && (f_v_times >0)) for (i=0; i<n_Clusters; i++) update_quality_change_mat(p_Docs, i);}int Kullback_leibler_k_means::assign_cluster(Matrix *p_Docs, bool simi_est){ int i,j, k, multi=0, changed=0, temp_Cluster_ID; float temp_sim; k=0; if(simi_est) { } else { for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp_sim = Sim_Mat[Cluster[i]][i]; temp_Cluster_ID = Cluster[i]; for (j = 0; j < n_Clusters; j++) if (j != Cluster[i]) { multi++; if (Sim_Mat[j][i] <temp_sim ) { temp_sim = Sim_Mat[j][i]; temp_Cluster_ID = j; } } if (temp_Cluster_ID != Cluster[i]) { Cluster[i] = temp_Cluster_ID; Sim_Mat[Cluster[i]][i] = temp_sim; changed++; } i++; } k++; } } if(dumpswitch) { cout << multi << " dot product computation\n"; cout << changed << " assignment changes\n"; } return changed;}void Kullback_leibler_k_means::compute_sum_log(int i){ int j; C_log_C[i] =0; for (j = 0; j < n_Words; j++) if (Concept_Vector[i][j]>0) C_log_C[i] +=Concept_Vector[i][j]*log(Concept_Vector[i][j]); if (prior[i] >0) C_log_C[i] -= prior[i]*log(prior[i]); }void Kullback_leibler_k_means::update_centroids(Matrix *p_Docs){ int i, j, k; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; k=0; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]); i++; } k++; } break; case PRIOR_LAPLACE: for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = +ClusterSize[i]; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]); i++; } k++; } break; } for (i = 0; i < n_Clusters; i++) { prior[i] = norm_1(Concept_Vector[i], n_Words); compute_sum_log(i); }}/* initialization functions */void Kullback_leibler_k_means::initialize_cv(Matrix *p_Docs, char * seeding_file){ int i, j; switch (init_clustering) { case RANDOM_PERTURB_INIT: random_perturb_init(p_Docs); break; case RANDOM_CLUSTER_ID_INIT: random_cluster_ID(); break; case CONCEPT_VECTORS_INIT: random_centroids(p_Docs); break; case WELL_SEPARATED_CENTROID_INIT: well_separated_centroids(p_Docs, 0); break; case WELL_SEPARATED_CENTROID_INIT_MODIFY: well_separated_centroids(p_Docs, 1); break; case SEEDINGFILE_INIT: seeding_file_init(p_Docs, seeding_file); break; case ALREADY_ASSIGNED: break; default: random_cluster_ID(); break; } // reset concept vectors for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: for (i = 0; i < n_Docs; i++) if((Cluster[i] >=0) && (Cluster[i] < n_Clusters))//marked cluster[i]<0 points as seeds p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]); else Cluster[i] =0; break; case PRIOR_LAPLACE: for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = +ClusterSize[i]; for (i = 0; i < n_Docs; i++) if((Cluster[i] >=0) && (Cluster[i] < n_Clusters))//marked cluster[i]<0 points as seeds p_Docs->ith_add_CV_prior(i, Concept_Vector[Cluster[i]]); else Cluster[i] =0; break; } for (i = 0; i < n_Clusters; i++) { prior[i]= norm_1(Concept_Vector[i], n_Words); compute_sum_log(i); } compute_cluster_size(); if (laplacian == PRIOR_LAPLACE) for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] += ClusterSize[i]; for (i = 0; i < n_Clusters; i++) p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]); //cout<<coherence(n_Clusters)/log(2.0)<<endl; //cout<<p_Docs->GetNormSum()<<endl; Result = p_Docs->GetNormSum() - coherence(n_Clusters)/log(2.0); if(dumpswitch || evaluate) { outputClusterSize(); cout<<"Initial Obj. func.: "<<Result<<endl; if (n_Class >0) cout<<"Initial confusion matrix :"<<endl; generate_Confusion_Matrix (label, n_Class); } if(evaluate) { purity_Entropy_MutInfo(); F_measure(label, n_Class); micro_avg_precision_recall(); cout<<"The mutual information of the original matrix is : " <<p_Docs->MutualInfo()<<endl; cout<<"The mutual information of the final matrix is : "<<p_Docs->MutualInfo()-Result<<" (" <<(p_Docs->MutualInfo()-Result)*100.0/p_Docs->MutualInfo()<<"%)"<<endl; cout<<endl; cout <<"* Evaluation done. *"<<endl; exit (0); } if (f_v_times >0) { quality_change_mat = new (float *)[n_Clusters]; for (int j = 0; j < n_Clusters; j++) quality_change_mat [j] = new float[n_Docs]; memory_consume+=(n_Clusters*n_Docs)*sizeof(float); for (i = 0; i < n_Clusters; i++) update_quality_change_mat (p_Docs, i); } memory_consume+=p_Docs->GetMemoryUsed();}void Kullback_leibler_k_means::well_separated_centroids(Matrix *p_Docs, int choice){int i, j, k, max_ind, *cv = new int [n_Clusters];float max, cos_sum; bool *mark = new bool [n_Docs]; for (i=0; i< n_Docs; i++) mark[i] = false; for (i=0; i< n_Empty_Docs; i++) mark[empty_Docs_ID[i]] = true; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; break; case PRIOR_LAPLACE: for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 1.0; break; } switch (choice) { case 0: do{ cv[0] = rand_gen.GetUniformInt(n_Docs); }while(mark[cv[0]]); if(dumpswitch) { cout<<"Cluster centroids are chosen to be well separated from each other."<<endl; cout<<"Start with a random chosen vector"<<endl; } break; case 1: default: float *v; v = new float[n_Words]; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: for (i = 0; i < n_Words; i++) v[i] = 0.0; break; case PRIOR_LAPLACE: for (i = 0; i < n_Words; i++) v[i] = n_Docs; break; } for (i = 0; i < n_Docs; i++) p_Docs->ith_add_CV_prior(i, v);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -