📄 euclidean_k_means.cc
字号:
/* Implementation of Euclidean K-means Copyright (c) 2003, Yuqiang Guan*/#include <time.h>#include <assert.h>#include <iostream.h>#include <stdio.h>#include <fstream.h>#include <string.h>#include <math.h>#include "Euclidean_k_means.h"/* Constructor and Distructor*/Euclidean_k_means::Euclidean_k_means(Matrix *p_docs, int cluster[], int ini_num_clusters, int est_start, float Omiga, bool random_seeding, int seed, float epsilon) : Gmeans(p_docs, cluster, ini_num_clusters, est_start, Omiga, random_seeding, seed, epsilon){ Sim_Mat = new VECTOR_double[n_Clusters]; for (int j = 0; j < n_Clusters; j++) Sim_Mat[j] = new float[n_Docs]; memory_consume+=n_Clusters*n_Docs*sizeof(float); norm_CV = new float [ini_num_clusters];}Euclidean_k_means::~Euclidean_k_means(){ delete [] norm_CV;}/* Exponent kernel k-means functions */void Euclidean_k_means::general_k_means(Matrix *p_Docs){ int n_Iters, i, j, k; bool no_assignment_change; if(dumpswitch) cout<<endl<<"- Start Euclidean K-Means loop. -"<<endl<<endl; n_Iters =0; no_assignment_change =true; do { pre_Result = Result; n_Iters ++; if (n_Iters >EST_START) stablized =true; if(dumpswitch && stablized) cout <<"(Similarity estimation used.)"<<endl; if ( assign_cluster(p_Docs, stablized) ==0 ) { if (dumpswitch) cout<<"No points are moved in the last step "<<endl<<"@"<<endl<<endl; } else { no_assignment_change = false; compute_cluster_size(); if( n_Iters >= EST_START ) for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) old_CV[i][j] = Concept_Vector[i][j]; update_centroids(p_Docs); for (i = 0; i < n_Clusters; i++) average_vec(Concept_Vector[i], n_Words, ClusterSize[i]); for (i = 0; i < n_Clusters; i++) norm_CV[i] = norm_2(Concept_Vector[i], n_Words); if( n_Iters >= EST_START ) { for (i = 0; i < n_Clusters; i++) { diff[i] = 0.0; for (j = 0; j < n_Words; j++) diff[i] += (old_CV[i][j] - Concept_Vector[i][j]) * (old_CV[i][j] - Concept_Vector[i][j]); // diff[i] = diff[i] - 2*sqrt(diff[i]*Sim_Mat[Cluster[i]][i]); } if ( n_Iters > EST_START ) for ( i=0; i<n_Docs; i++) Sim_Mat[Cluster[i]][i] = p_Docs->euc_dis(Concept_Vector[Cluster[i]],i, norm_CV[Cluster[i]]); else for (i = 0; i < n_Clusters; i++) p_Docs->euc_dis(Concept_Vector[i], norm_CV[i], Sim_Mat[i]); } else for (i = 0; i < n_Clusters; i++) p_Docs->euc_dis(Concept_Vector[i], norm_CV[i], Sim_Mat[i]); for (i=0; i<n_Clusters; i++) cluster_quality[i] =0.0; k=0; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { cluster_quality[Cluster[i]] += Sim_Mat[Cluster[i]][i]; i++; } k++; } Result = coherence(n_Clusters); if(dumpswitch) { find_worst_vectors(false); cout<<"Obj. func. ( + Omiga * n_Clusters) = "<<Result<<endl<<"@"<<endl<<endl; if (verify) cout<<"Verify obj. func. : "<<verify_obj_func(p_Docs,n_Clusters)<<endl; } else cout<<"E"; } } while ((pre_Result-Result) > Epsilon*initial_obj_fun_val); cout<<endl; if (dumpswitch) { cout <<"Euclidean K-Means loop stoped with "<<n_Iters<<" iterations."<<endl; generate_Confusion_Matrix (label, n_Class); } if (stablized) for (i = 0; i < n_Clusters; i++) p_Docs->euc_dis(Concept_Vector[i], norm_CV[i], Sim_Mat[i]); if ((!no_assignment_change) && (f_v_times >0)) for (i=0; i<n_Clusters; i++) update_quality_change_mat(p_Docs, i);}int Euclidean_k_means::assign_cluster(Matrix *p_Docs, bool simi_est){ int i,j, k, multi=0, changed=0, temp_Cluster_ID; float temp_sim; k=0; if(simi_est) { for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Docs; j++) if (i != Cluster[j]) Sim_Mat[i][j] += diff[i] - 2*sqrt(diff[i]*Sim_Mat[i][j]); for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp_sim = Sim_Mat[Cluster[i]][i]; temp_Cluster_ID = Cluster[i]; for (j = 0; j < n_Clusters; j++) if (j != Cluster[i]) { if (Sim_Mat[j][i] < temp_sim) { multi++; Sim_Mat[j][i] = p_Docs->euc_dis(Concept_Vector[j], i, norm_CV[j]); if (Sim_Mat[j][i] < temp_sim) { temp_sim = Sim_Mat[j][i]; temp_Cluster_ID = j; } } } if (temp_Cluster_ID != Cluster[i]) { Cluster[i] = temp_Cluster_ID; Sim_Mat[Cluster[i]][i] = temp_sim; changed++; } i++; } k++; } } else { for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp_sim = Sim_Mat[Cluster[i]][i]; temp_Cluster_ID = Cluster[i]; for (j = 0; j < n_Clusters; j++) if (j != Cluster[i]) { multi++; if (Sim_Mat[j][i] < temp_sim ) { temp_sim = Sim_Mat[j][i]; temp_Cluster_ID = j; } } if (temp_Cluster_ID != Cluster[i]) { Cluster[i] = temp_Cluster_ID; Sim_Mat[Cluster[i]][i] = temp_sim; changed++; } i++; } k++; } } if(dumpswitch) { cout << multi << " Euclidean distance computation\n"; cout << changed << " assignment changes\n"; } return changed;}void Euclidean_k_means::update_centroids(Matrix *p_Docs){ int i, j, k; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; k=0; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { p_Docs->ith_add_CV(i, Concept_Vector[Cluster[i]]); i++; } k++; }}/* initialization functions */void Euclidean_k_means::initialize_cv(Matrix *p_Docs, char * seeding_file){ int i, j, k; switch (init_clustering) { case RANDOM_PERTURB_INIT: random_perturb_init(p_Docs); break; case RANDOM_CLUSTER_ID_INIT: random_cluster_ID(); break; case CONCEPT_VECTORS_INIT: random_centroids(p_Docs); break; case WELL_SEPARATED_CENTROID_INIT: well_separated_centroids(p_Docs, 0); break; case WELL_SEPARATED_CENTROID_INIT_MODIFY: well_separated_centroids(p_Docs, 1); break; case SEEDINGFILE_INIT: seeding_file_init(p_Docs, seeding_file); break; case ALREADY_ASSIGNED: break; default: well_separated_centroids(p_Docs, 1); break; } // reset concept vectors for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; for (i = 0; i < n_Docs; i++) { if((Cluster[i] >=0) && (Cluster[i] < n_Clusters)) p_Docs->ith_add_CV(i, Concept_Vector[Cluster[i]]); else Cluster[i] =0; } compute_cluster_size(); for (i = 0; i < n_Clusters; i++) average_vec(Concept_Vector[i], n_Words, ClusterSize[i]); for (i = 0; i < n_Clusters; i++) norm_CV[i] = norm_2(Concept_Vector[i], n_Words); for (i = 0; i < n_Clusters; i++) p_Docs->euc_dis(Concept_Vector[i], norm_CV[i], Sim_Mat[i]); for (i=0; i<n_Clusters; i++) cluster_quality[i] =0.0; k=0; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { cluster_quality[Cluster[i]] += Sim_Mat[Cluster[i]][i]; i++; } k++; } //for (i = 0; i < n_Clusters; i++) //diff[i] = 0.0; // because we need give the coherence here. initial_obj_fun_val= Result = coherence(n_Clusters); fv_threshold = -1.0*initial_obj_fun_val*Delta; if(dumpswitch || evaluate) { outputClusterSize(); cout<<"Initial Obj. func.: "<<Result<<endl; if (n_Class >0) cout<<"Initial confusion matrix :"<<endl; generate_Confusion_Matrix (label, n_Class); } if(evaluate) { purity_Entropy_MutInfo(); F_measure(label, n_Class); micro_avg_precision_recall(); cout<<endl; cout <<"* Evaluation done. *"<<endl; exit (0); } if (f_v_times >0) { quality_change_mat = new (float *)[n_Clusters]; for (int j = 0; j < n_Clusters; j++) quality_change_mat [j] = new float[n_Docs]; memory_consume+=(n_Clusters*n_Docs)*sizeof(float); for (i = 0; i < n_Clusters; i++) update_quality_change_mat (p_Docs, i); } memory_consume+=p_Docs->GetMemoryUsed();}void Euclidean_k_means::well_separated_centroids(Matrix *p_Docs, int choice){ int i, j, k, min_ind, *cv = new int [n_Clusters]; float min, cos_sum; bool *mark = new bool [n_Docs]; for (i=0; i< n_Docs; i++) mark[i] = false; for (i=0; i< n_Empty_Docs; i++) mark[empty_Docs_ID[i]] = true; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; switch (choice) { case 0: do{ cv[0] = rand_gen.GetUniformInt(n_Docs); }while (mark[cv[0]]); if(dumpswitch) { cout<<"Cluster centroids are chosen to be well separated from each other."<<endl; cout<<"Start with a random chosen vector"<<endl; } break; case 1: default: float *v, min; int min_ID=0; v = new float[n_Words]; for (i = 0; i < n_Words; i++) v[i] = 0.0; for (i = 0; i < n_Docs; i++) p_Docs->ith_add_CV(i, v); float temp, temp_norm; k=0; average_vec(v, n_Words, n_Docs); temp_norm = norm_2(v, n_Words); min =0.0; min_ID =0; for(i=0; i<n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp = p_Docs->euc_dis(v, i, temp_norm); if ( temp > min ) { min = temp; min_ID = i; } i++; } k++; } cv[0] = min_ID; delete [] v; if(dumpswitch) { cout<<"Cluster centroids are chosen to be well separated from each other."<<endl; cout<<"Start with a vector farthest from the centroid of the whole data set"<<endl; } break; } p_Docs->ith_add_CV(cv[0], Concept_Vector[0]); mark[cv[0]] = true; norm_CV[0] = p_Docs->GetNorm(cv[0]); p_Docs->euc_dis(Concept_Vector[0], norm_CV[0], Sim_Mat[0]); for (i=1; i<n_Clusters; i++) { min_ind = 0; min = 0.0; for (j=0; j<n_Docs; j++) { if(!mark[j]) { cos_sum = 0.0; for (k=0; k<i; k++) cos_sum += Sim_Mat[k][j]; if (cos_sum > min) { min = cos_sum; min_ind = j; } } } cv[i] = min_ind; p_Docs->ith_add_CV(cv[i], Concept_Vector[i]); norm_CV[i] = p_Docs->GetNorm(cv[i]); p_Docs->euc_dis(Concept_Vector[i], norm_CV[i], Sim_Mat[i]); mark[cv[i]] = true; } for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false); if(dumpswitch) { cout<<"Vectors chosen to be the centroids are : "; for (i=0; i<n_Clusters; i++) cout<<cv[i]<<" "; cout<<endl; } delete [] cv; delete [] mark;}void Euclidean_k_means::random_centroids(Matrix *p_Docs){ int i, j, *cv = new int [n_Clusters]; bool *mark= new bool[n_Docs]; for (i=0; i< n_Docs; i++) mark[i] = false; for (i=0; i< n_Empty_Docs; i++) mark[empty_Docs_ID[i]] = true; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; for (i=0; i<n_Clusters; i++) { do{ cv[i] = rand_gen.GetUniformInt(n_Docs); } while (mark[cv[i]]); mark[cv[i]] = true; p_Docs->ith_add_CV(cv[i], Concept_Vector[i]); } if(dumpswitch) { cout<<"Randomly chose centroids"<<endl; cout<<"Vectors chosen to be the centroids are : "; for (i=0; i<n_Clusters; i++) cout<<cv[i]<<" "; cout<<endl; } delete [] mark; delete [] cv; for (i = 0; i < n_Clusters; i++) norm_CV[i] = p_Docs->GetNorm(cv[i]); for (i = 0; i < n_Clusters; i++) p_Docs->euc_dis(Concept_Vector[i], norm_CV[i], Sim_Mat[i]); for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false);}void Euclidean_k_means::random_perturb_init(Matrix *p_Docs){ VECTOR_double v, temp_v; int i, j; float temp_norm; v = new float[n_Words]; temp_v = new float[n_Words]; for (i = 0; i < n_Words; i++) v[i] = 0.0; for (i = 0; i < n_Docs; i++) p_Docs->ith_add_CV(i, v); average_vec(v, n_Words, n_Docs); for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; for (i = 0; i < n_Clusters; i++) { for (j = 0; j < n_Words; j++) temp_v[j] = rand_gen.GetUniform() - 0.5; normalize_vec(temp_v, n_Words); temp_norm = Perturb * rand_gen.GetUniform(); for (j = 0; j < n_Words; j++) temp_v[j] *= temp_norm; for (j = 0; j < n_Words; j++) Concept_Vector[i][j] += v[j]*(1+ temp_v[j]); } delete [] v; delete [] temp_v; for (i = 0; i < n_Clusters; i++) norm_CV[i] = norm_2(Concept_Vector[i], n_Words); for (i = 0; i < n_Clusters; i++) p_Docs->euc_dis(Concept_Vector[i], norm_CV[i], Sim_Mat[i]); for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false); if(dumpswitch) { cout<<"Generated the centroid of whole data set then perturb around it"<<endl; }}/* first variations functions */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -