// spherical_k_means.cc
/* Implementation of Spherical K-means  Copyright (c) 2003, Yuqiang Guan */

#include <time.h>
#include <assert.h>
#include <iostream>
#include <stdio.h>
#include <fstream>
#include <string.h>
#include <math.h>
#include "Spherical_k_means.h"

using namespace std;

/* Constructor */

Spherical_k_means::Spherical_k_means(Matrix *p_docs, int cluster[], int ini_num_clusters,
                                     int est_start, float Omiga, bool random_seeding,
                                     int seed, float epsilon)
  : Gmeans(p_docs, cluster, ini_num_clusters, est_start, Omiga, random_seeding, seed, epsilon)
{
  Sim_Mat = new VECTOR_double[n_Clusters];
  for (int j = 0; j < n_Clusters; j++)
    Sim_Mat[j] = new float[n_Docs];
  memory_consume += n_Clusters * n_Docs * sizeof(float);
}

/* Spherical K-means functions */

void Spherical_k_means::general_k_means(Matrix *p_Docs)
{
  int n_Iters, i, j;

  if (dumpswitch)
    cout << endl << "- Start Spherical K-Means loop. -" << endl << endl;
  n_Iters = 0;
  no_assignment_change = true;
  do
  {
    pre_Result = Result;
    n_Iters++;
    if (n_Iters > EST_START)
      stablized = true;
    if (dumpswitch && stablized)
      cout << "(Similarity estimation used.)" << endl;
    if (assign_cluster(p_Docs, stablized) == 0)
    {
      if (dumpswitch)
        cout << "No points are moved in the last step " << endl << "@" << endl << endl;
    }
    else
    {
      compute_cluster_size();
      no_assignment_change = false;
      if (stablized)
        for (i = 0; i < n_Clusters; i++)
          for (j = 0; j < n_Words; j++)
            old_CV[i][j] = Concept_Vector[i][j];
      update_centroids(p_Docs);
      if (stablized)
      {
        // diff[i] = how far concept vector i moved in this iteration;
        // assign_cluster() uses it to bound the stale similarities.
        for (i = 0; i < n_Clusters; i++)
        {
          diff[i] = 0.0;
          for (j = 0; j < n_Words; j++)
            diff[i] += (old_CV[i][j] - Concept_Vector[i][j]) * (old_CV[i][j] - Concept_Vector[i][j]);
          diff[i] = sqrt(diff[i]);
        }
        // Only each document's similarity to its own cluster is recomputed exactly.
        for (i = 0; i < n_Docs; i++)
          Sim_Mat[Cluster[i]][i] = p_Docs->dot_mult(Concept_Vector[Cluster[i]], i);
      }
      else
        for (i = 0; i < n_Clusters; i++)
          p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
      Result = coherence(n_Clusters);
      if (dumpswitch)
      {
        find_worst_vectors(false);
        cout << "Obj. func. ( + Omiga * n_Clusters) = " << Result << endl << "@" << endl << endl;
        if (verify)
          cout << "Verify obj. func. : " << verify_obj_func(p_Docs, n_Clusters) << endl;
      }
      else
        cout << "S";
    }
  } while ((Result - pre_Result) > Epsilon * Result);
  cout << endl;
  if (dumpswitch)
  {
    cout << "Spherical K-Means loop stopped with " << n_Iters << " iterations." << endl;
    generate_Confusion_Matrix(label, n_Class);
  }
  if (stablized)
    for (i = 0; i < n_Clusters; i++)
      p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  if ((!no_assignment_change) && (f_v_times > 0))
    for (i = 0; i < n_Clusters; i++)
      update_quality_change_mat(p_Docs, i);
}
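/*
  Minimal illustrative sketch (hypothetical helper, not part of the original
  gmeans sources): the quantity the loop above maximizes is the coherence
      Q = sum_i cos(x_i, c_{Cluster[i]}),
  which reduces to one dot product per document when both documents and
  concept vectors have unit length. Dense float arrays are assumed here
  purely for clarity; the real code works on the sparse Matrix class.
*/
static float sketch_coherence(float **docs, float **centroids, const int *cluster,
                              int n_docs, int n_words)
{
  float q = 0.0;
  for (int d = 0; d < n_docs; d++)
  {
    float dot = 0.0;
    for (int w = 0; w < n_words; w++)
      dot += docs[d][w] * centroids[cluster[d]][w];  // cosine, since both vectors are unit length
    q += dot;
  }
  return q;
}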
int Spherical_k_means::assign_cluster(Matrix *p_Docs, bool simi_est)
{
  int i, j, k, multi = 0, changed = 0, temp_Cluster_ID;
  float temp_sim;

  k = 0;
  if (simi_est)
  {
    // Bump every cached similarity (except a document's own cluster, which is
    // kept exact) by the drift of that centroid, so it remains an upper bound.
    for (i = 0; i < n_Clusters; i++)
      for (j = 0; j < n_Docs; j++)
        if (i != Cluster[j])
          Sim_Mat[i][j] += diff[i];
    // The while/for pattern below walks the documents while skipping the
    // (sorted) empty documents listed in empty_Docs_ID.
    for (i = 0; i < n_Docs; i++)
    {
      while (i < empty_Docs_ID[k])
      {
        temp_sim = Sim_Mat[Cluster[i]][i];
        temp_Cluster_ID = Cluster[i];
        for (j = 0; j < n_Clusters; j++)
          if (j != Cluster[i])
          {
            if (Sim_Mat[j][i] > temp_sim)
            {
              // The upper bound beats the current best, so compute the exact dot product.
              multi++;
              Sim_Mat[j][i] = p_Docs->dot_mult(Concept_Vector[j], i);
              if (Sim_Mat[j][i] > temp_sim)
              {
                temp_sim = Sim_Mat[j][i];
                temp_Cluster_ID = j;
              }
            }
          }
        if (temp_Cluster_ID != Cluster[i])
        {
          Cluster[i] = temp_Cluster_ID;
          Sim_Mat[Cluster[i]][i] = temp_sim;
          changed++;
        }
        i++;
      }
      k++;
    }
  }
  else
  {
    for (i = 0; i < n_Docs; i++)
    {
      while (i < empty_Docs_ID[k])
      {
        temp_sim = Sim_Mat[Cluster[i]][i];
        temp_Cluster_ID = Cluster[i];
        for (j = 0; j < n_Clusters; j++)
          if (j != Cluster[i])
          {
            multi++;
            if (Sim_Mat[j][i] > temp_sim)
            {
              temp_sim = Sim_Mat[j][i];
              temp_Cluster_ID = j;
            }
          }
        if (temp_Cluster_ID != Cluster[i])
        {
          Cluster[i] = temp_Cluster_ID;
          Sim_Mat[Cluster[i]][i] = temp_sim;
          changed++;
        }
        i++;
      }
      k++;
    }
  }
  if (dumpswitch)
  {
    cout << multi << " dot product computations\n";
    cout << changed << " assignment changes\n";
  }
  return changed;
}

void Spherical_k_means::update_centroids(Matrix *p_Docs)
{
  int i, j, k;

  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  k = 0;
  for (i = 0; i < n_Docs; i++)
  {
    while (i < empty_Docs_ID[k])
    {
      p_Docs->ith_add_CV(i, Concept_Vector[Cluster[i]]);
      i++;
    }
    k++;
  }
  for (i = 0; i < n_Clusters; i++)
    CV_norm[i] = normalize_vec(Concept_Vector[i], n_Words);
}
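/*
  Hypothetical standalone sketch (dense vectors, illustrative names) of the
  similarity-estimation test used above when simi_est is true. By
  Cauchy-Schwarz, for a unit document x and a centroid that moved from c_old
  to c_new,
      x . c_new <= x . c_old + ||c_new - c_old||,
  so a cached similarity that has already been bumped by diff[c] is an upper
  bound on the true similarity, and the exact dot product is needed only when
  that bound exceeds the current best.
*/
static int sketch_best_cluster(const float *x,     // one unit-length document (dense)
                               float **centroids,  // current unit-length concept vectors
                               float *cached_sim,  // cached_sim[c] = stale cosine + centroid drift
                               int n_clusters, int n_words, int current_cluster)
{
  float best = cached_sim[current_cluster];        // similarity to the own cluster is kept exact
  int best_c = current_cluster;
  for (int c = 0; c < n_clusters; c++)
  {
    if (c == current_cluster)
      continue;
    if (cached_sim[c] <= best)
      continue;                                    // upper bound cannot win: skip the dot product
    float dot = 0.0;                               // the bound might win: compute the exact cosine
    for (int w = 0; w < n_words; w++)
      dot += x[w] * centroids[c][w];
    cached_sim[c] = dot;
    if (dot > best)
    {
      best = dot;
      best_c = c;
    }
  }
  return best_c;
}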
/* initialization functions */

void Spherical_k_means::initialize_cv(Matrix *p_Docs, char *seeding_file)
{
  int i, j;

  switch (init_clustering)
  {
  case RANDOM_PERTURB_INIT:
    random_perturb_init(p_Docs);
    break;
  case RANDOM_CLUSTER_ID_INIT:
    random_cluster_ID();
    break;
  case CONCEPT_VECTORS_INIT:
    random_centroids(p_Docs);
    break;
  case WELL_SEPARATED_CENTROID_INIT:
    well_separated_centroids(p_Docs, 0);
    break;
  case WELL_SEPARATED_CENTROID_INIT_MODIFY:
    well_separated_centroids(p_Docs, 1);
    break;
  case SEEDINGFILE_INIT:
    seeding_file_init(p_Docs, seeding_file);
    break;
  case ALREADY_ASSIGNED:
    break;
  default:
    well_separated_centroids(p_Docs, 1);
    break;
  }

  // reset concept vectors
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  for (i = 0; i < n_Docs; i++)
  {
    if ((Cluster[i] >= 0) && (Cluster[i] < n_Clusters))  // points with Cluster[i] < 0 were marked as seeds
      p_Docs->ith_add_CV(i, Concept_Vector[Cluster[i]]);
    else
      Cluster[i] = 0;
  }
  compute_cluster_size();
  for (i = 0; i < n_Clusters; i++)
    CV_norm[i] = normalize_vec(Concept_Vector[i], n_Words);
  for (i = 0; i < n_Clusters; i++)
    p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  //for (i = 0; i < n_Clusters; i++)
  //  diff[i] = 0.0;

  // because we need to report the coherence here
  Result = coherence(n_Clusters);
  if (dumpswitch || evaluate)
  {
    outputClusterSize();
    cout << "Initial Obj. func.: " << Result << endl;
    if (n_Class > 0)
      cout << "Initial confusion matrix :" << endl;
    generate_Confusion_Matrix(label, n_Class);
  }
  if (evaluate)
  {
    purity_Entropy_MutInfo();
    F_measure(label, n_Class);
    micro_avg_precision_recall();
    cout << endl;
    cout << "* Evaluation done. *" << endl;
    exit(0);
  }
  if (f_v_times > 0)
  {
    quality_change_mat = new float *[n_Clusters];
    for (int j = 0; j < n_Clusters; j++)
      quality_change_mat[j] = new float[n_Docs];
    memory_consume += (n_Clusters * n_Docs) * sizeof(float);
    for (i = 0; i < n_Clusters; i++)
      update_quality_change_mat(p_Docs, i);
  }
  memory_consume += p_Docs->GetMemoryUsed();
}

void Spherical_k_means::well_separated_centroids(Matrix *p_Docs, int choice)
{
  int i, j, k, min_ind, *cv = new int[n_Clusters];
  float min, cos_sum;
  bool *mark = new bool[n_Docs];

  for (i = 0; i < n_Docs; i++)
    mark[i] = false;
  for (i = 0; i < n_Empty_Docs; i++)
    mark[empty_Docs_ID[i]] = true;
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;

  switch (choice)
  {
  case 0:
    do
    {
      cv[0] = rand_gen.GetUniformInt(n_Docs);
    } while (mark[cv[0]]);
    if (dumpswitch)
    {
      cout << "Cluster centroids are chosen to be well separated from each other." << endl;
      cout << "Start with a randomly chosen vector" << endl;
    }
    break;
  case 1:
  default:
  {
    // Start from the document farthest (in cosine) from the centroid of the whole data set.
    float *v, min;
    int min_ID = 0;
    v = new float[n_Words];
    for (i = 0; i < n_Words; i++)
      v[i] = 0.0;
    for (i = 0; i < n_Docs; i++)
      p_Docs->ith_add_CV(i, v);
    float temp;
    k = 0;
    normalize_vec(v, n_Words);
    min = 1.0;
    min_ID = 0;
    for (i = 0; i < n_Docs; i++)
    {
      while (i < empty_Docs_ID[k])
      {
        temp = p_Docs->dot_mult(v, i);
        if (temp < min)
        {
          min = temp;
          min_ID = i;
        }
        i++;
      }
      k++;
    }
    cv[0] = min_ID;
    delete [] v;
    if (dumpswitch)
    {
      cout << "Cluster centroids are chosen to be well separated from each other." << endl;
      cout << "Start with the vector farthest from the centroid of the whole data set" << endl;
    }
    break;
  }
  }

  p_Docs->ith_add_CV(cv[0], Concept_Vector[0]);
  mark[cv[0]] = true;
  p_Docs->trans_mult(Concept_Vector[0], Sim_Mat[0]);
  // Each remaining seed is the unmarked document with the smallest summed
  // cosine similarity to the seeds chosen so far.
  for (i = 1; i < n_Clusters; i++)
  {
    min_ind = 0;
    min = 2.0 * n_Clusters;
    for (j = 0; j < n_Docs; j++)
    {
      if (!mark[j])
      {
        cos_sum = 0;
        for (k = 0; k < i; k++)
          cos_sum += Sim_Mat[k][j];
        if (cos_sum < min)
        {
          min = cos_sum;
          min_ind = j;
        }
      }
    }
    cv[i] = min_ind;
    p_Docs->ith_add_CV(cv[i], Concept_Vector[i]);
    p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
    mark[cv[i]] = true;
  }
  for (i = 0; i < n_Docs; i++)
    Cluster[i] = 0;
  assign_cluster(p_Docs, false);
  if (dumpswitch)
  {
    cout << "Vectors chosen to be the centroids are : ";
    for (i = 0; i < n_Clusters; i++)
      cout << cv[i] << " ";
    cout << endl;
  }
  delete [] cv;
  delete [] mark;
}

void Spherical_k_means::random_centroids(Matrix *p_Docs)
{
  int i, j, *cv = new int[n_Clusters];
  bool *mark = new bool[n_Docs];

  for (i = 0; i < n_Docs; i++)
    mark[i] = false;
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  for (i = 0; i < n_Clusters; i++)
  {
    do
    {
      cv[i] = rand_gen.GetUniformInt(n_Docs);
    } while (mark[cv[i]]);
    mark[cv[i]] = true;
    p_Docs->ith_add_CV(cv[i], Concept_Vector[i]);
  }
  if (dumpswitch)
  {
    cout << "Randomly chose centroids" << endl;
    cout << "Vectors chosen to be the centroids are : ";
    for (i = 0; i < n_Clusters; i++)
      cout << cv[i] << " ";
    cout << endl;
  }
  delete [] mark;
  delete [] cv;
  for (i = 0; i < n_Clusters; i++)
    p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  for (i = 0; i < n_Docs; i++)
    Cluster[i] = 0;
  assign_cluster(p_Docs, false);
}

void Spherical_k_means::random_perturb_init(Matrix *p_Docs)
{
  VECTOR_double v, temp_v;
  int i, j;
  float temp_norm;

  v = new float[n_Words];
  temp_v = new float[n_Words];
  for (i = 0; i < n_Words; i++)
    v[i] = 0.0;
  for (i = 0; i < n_Docs; i++)
    p_Docs->ith_add_CV(i, v);
  normalize_vec(v, n_Words);
  for (i = 0; i < n_Clusters; i++)
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] = 0.0;
  for (i = 0; i < n_Clusters; i++)
  {
    for (j = 0; j < n_Words; j++)
      temp_v[j] = rand_gen.GetUniform() - 0.5;
    normalize_vec(temp_v, n_Words);
    temp_norm = Perturb * rand_gen.GetUniform();
    for (j = 0; j < n_Words; j++)
      temp_v[j] *= temp_norm;
    for (j = 0; j < n_Words; j++)
      Concept_Vector[i][j] += fabs(v[j] * (1 + temp_v[j]));
  }
  delete [] v;
  delete [] temp_v;
  for (i = 0; i < n_Clusters; i++)
    CV_norm[i] = normalize_vec(Concept_Vector[i], n_Words);
  for (i = 0; i < n_Clusters; i++)
    p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]);
  for (i = 0; i < n_Docs; i++)
    Cluster[i] = 0;
  assign_cluster(p_Docs, false);
  if (dumpswitch)
  {
    cout << "Generated the centroid of the whole data set, then perturbed around it" << endl;
  }
}
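/*
  Dense-vector sketch (hypothetical helper, not from the original sources) of
  the "well separated" seeding implemented in well_separated_centroids():
  after the first seed is fixed, each new seed is the non-empty, not-yet-chosen
  document whose summed cosine similarity to the seeds picked so far is
  smallest.
*/
static int sketch_pick_next_seed(float **docs, const int *seeds, int n_seeds,
                                 const bool *excluded, int n_docs, int n_words)
{
  int best_doc = -1;
  float best_sum = 0.0;
  for (int d = 0; d < n_docs; d++)
  {
    if (excluded[d])
      continue;                  // already chosen as a seed, or an empty document
    float sum = 0.0;
    for (int s = 0; s < n_seeds; s++)
    {
      float dot = 0.0;
      for (int w = 0; w < n_words; w++)
        dot += docs[d][w] * docs[seeds[s]][w];
      sum += dot;                // cosine, since documents are unit length
    }
    if (best_doc < 0 || sum < best_sum)
    {
      best_sum = sum;
      best_doc = d;
    }
  }
  return best_doc;
}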
/* first variations functions */

float Spherical_k_means::K_L_first_variation(Matrix *p_Docs)
/* we are going to use the sum vector instead of the concept vector in FV */
{
  int i, j, max_change_index = -1;
  float *change = new float[f_v_times], *total_change = new float[f_v_times],
        max_change = 0.0, pre_change;
  float *old_CV_norm = new float[n_Clusters];
  one_step *track = new one_step[f_v_times];
  bool *been_converted = new bool[n_Clusters];
  int *original_Cluster = new int[f_v_times], *old_ClusterSize = new int[n_Clusters];
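  /*
    Note (illustrative): with unit-length documents and c_k = s_k / ||s_k||,
    where s_k is the sum vector of cluster k, the spherical k-means objective
        sum_i x_i . c_{Cluster[i]}
    equals sum_k ||s_k||.  Moving document d from cluster p to cluster q
    therefore changes the objective by
        ||s_p - x_d|| + ||s_q + x_d|| - ||s_p|| - ||s_q||,
    which is presumably the quantity the first-variation bookkeeping declared
    above (change, total_change, old_CV_norm, ...) is set up to track.
  */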