📄 diametric_k_means.cc
字号:
/* Implementation of Diametric K-means Copyright (c) 2002, Yuqiang Guan*/#include <time.h>#include <assert.h>#include <iostream.h>#include <stdio.h>#include <fstream.h>#include <string.h>#include <math.h>#include "Diametric_k_means.h"/* Constructor and Distructor */Diametric_k_means::Diametric_k_means(Matrix *p_docs, int cluster[], int ini_num_clusters, int est_start, float Omiga, bool random_seeding, int seed, float epsilon) : Gmeans(p_docs, cluster, ini_num_clusters, est_start, Omiga, random_seeding, seed, epsilon){ Sim_Mat = new VECTOR_double[n_Clusters]; for (int j = 0; j < n_Clusters; j++) Sim_Mat[j] = new float[n_Docs]; memory_consume+=n_Clusters*n_Docs*sizeof(float); addition = new float [ini_num_clusters];}Diametric_k_means::~Diametric_k_means(){ delete [] addition;}/* Diametric K-means functions */void Diametric_k_means::general_k_means(Matrix *p_Docs){ int n_Iters, i, j; bool no_assignment_change; if(dumpswitch) cout<<endl<<"- Start General K-Means loop. -"<<endl<<endl; n_Iters =0; no_assignment_change =true; do { pre_Result = Result; n_Iters ++; if (n_Iters >EST_START) stablized =true; if(dumpswitch && stablized) cout <<"(Similarity estimation used.)"<<endl; if ( assign_cluster(p_Docs, stablized) == 0 ) { if (dumpswitch) cout<<"No points are moved in the last step "<<endl<<"@"<<endl<<endl; } else { compute_cluster_size(); if( stablized) for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) old_CV[i][j] = Concept_Vector[i][j]; update_centroids(p_Docs); if(stablized) { for (i = 0; i < n_Clusters; i++) { diff[i] = 0.0; addition[i] =0.0; for (j = 0; j < n_Words; j++) { diff[i] += (old_CV[i][j] - Concept_Vector[i][j]) * (old_CV[i][j] - Concept_Vector[i][j]); addition[i] += (old_CV[i][j] + Concept_Vector[i][j]) * (old_CV[i][j] + Concept_Vector[i][j]); } diff[i] = sqrt(diff[i]); addition[i] = sqrt (addition[i]); diff[i] *= addition[i]; } for ( i=0; i<n_Docs; i++) Sim_Mat[Cluster[i]][i] = p_Docs->squared_dot_mult (Concept_Vector[Cluster[i]], i); } else for (i = 0; i < n_Clusters; i++) p_Docs->squared_trans_mult(Concept_Vector[i], Sim_Mat[i]); Result = coherence(n_Clusters); if(dumpswitch) { find_worst_vectors(false); cout<<"Obj. func. ( + Omiga * n_Clusters) = "<<Result<<endl<<"@"<<endl<<endl; } else cout<<"D"; } } while ((Result - pre_Result) > Epsilon*Result); cout<<endl; if (dumpswitch) { cout <<"Diametric K-Means loop stoped with "<<n_Iters<<" iterations."<<endl; generate_Confusion_Matrix (label, n_Class); } if (stablized) for (i = 0; i < n_Clusters; i++) p_Docs->squared_trans_mult(Concept_Vector[i], Sim_Mat[i]); if ((!no_assignment_change) && (f_v_times >0)) for (i=0; i<n_Clusters; i++) update_quality_change_mat(p_Docs, i);}int Diametric_k_means::assign_cluster(Matrix *p_Docs, bool simi_est){ int i,j, k, multi=0, changed=0, temp_Cluster_ID; float temp_sim; k=0; if(simi_est) { for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Docs; j++) if (i != Cluster[j]) Sim_Mat[i][j] += diff[i]; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp_sim = Sim_Mat[Cluster[i]][i]; temp_Cluster_ID = Cluster[i]; for (j = 0; j < n_Clusters; j++) if (j != Cluster[i]) { if (Sim_Mat[j][i] > temp_sim) { multi++; Sim_Mat[j][i] = p_Docs->squared_dot_mult(Concept_Vector[j], i); if (Sim_Mat[j][i] > temp_sim) { temp_sim = Sim_Mat[j][i]; temp_Cluster_ID = j; } } } if (temp_Cluster_ID != Cluster[i]) { Cluster[i] = temp_Cluster_ID; Sim_Mat[Cluster[i]][i] = temp_sim; changed++; } i++; } k++; } } else { for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp_sim = Sim_Mat[Cluster[i]][i]; temp_Cluster_ID = Cluster[i]; for (j = 0; j < n_Clusters; j++) if (j != Cluster[i]) { multi++; if (Sim_Mat[j][i]>temp_sim ) { temp_sim = Sim_Mat[j][i]; temp_Cluster_ID = j; } } if (temp_Cluster_ID != Cluster[i]) { Cluster[i] = temp_Cluster_ID; Sim_Mat[Cluster[i]][i] = temp_sim; changed++; } i++; } k++; } } if(dumpswitch) { cout << multi << " dot product computation\n"; cout << changed << " assignment changes\n"; } return changed;}void Diametric_k_means::update_centroids(Matrix *p_Docs){ p_Docs->right_dom_SV(Cluster, ClusterSize, n_Clusters, Concept_Vector, cluster_quality, -1);}/* initialization functions */void Diametric_k_means::Diametric_seeding_file_init(Matrix *p_Docs, char * seeding_file){ std::ifstream gpfile(seeding_file); int i, max_class=0; char whole_line[256]; if(gpfile.is_open()) { cout<<"Initial cluster ID file: "<<seeding_file<<endl; gpfile>>i; gpfile.getline(whole_line, 256, '\n'); if(i!=n_Docs) { cout<<"Wrong number of vectors!!! \nSo using well-separate-centroid initialization ..."<<endl; well_separated_centroids(p_Docs, 1); } else { for (i=0; i<n_Docs; i++) { gpfile>>Cluster[i]; Cluster[i] /=2; if (max_class<Cluster[i]) max_class=Cluster[i]; gpfile.getline(whole_line, 256, '\n'); } if (max_class != n_Clusters-1) { cout<<"Number of clusters in the seeding file is not correct!!!\nSo using well-separate-centroid initialization ..."<<endl; well_separated_centroids(p_Docs, 1); } else if(dumpswitch) { cout<<"Initial centroids are generated from a seeding file."<<endl; } } gpfile.close(); } else { if (!evaluate) { cout<<"Can't open "<<seeding_file<<endl<<"So using well-separate-centroid initialization ..."<<endl; well_separated_centroids(p_Docs, 1); } else { cout<<"Can't open "<<seeding_file<<" or wrong file format."<<endl; exit (0); } }}void Diametric_k_means::initialize_cv(Matrix *p_Docs, char * seeding_file){ int i; switch (init_clustering) { case RANDOM_PERTURB_INIT: random_perturb_init(p_Docs); break; case RANDOM_CLUSTER_ID_INIT: random_cluster_ID(); for (i=0; i<n_Clusters; i++) //make sure Concept_Vector is not zero Concept_Vector[i][0] =1.0; break; case CONCEPT_VECTORS_INIT: random_centroids(p_Docs); break; case WELL_SEPARATED_CENTROID_INIT: well_separated_centroids(p_Docs, 0); break; case WELL_SEPARATED_CENTROID_INIT_MODIFY: well_separated_centroids(p_Docs, 1); break; case SEEDINGFILE_INIT: if (evaluate) Diametric_seeding_file_init(p_Docs, seeding_file); else seeding_file_init(p_Docs, seeding_file); for (i=0; i<n_Clusters; i++) for(int j=0; j<n_Words; j++) Concept_Vector[i][j] =1.0; break; case ALREADY_ASSIGNED: break; default: well_separated_centroids(p_Docs, 1); break; } // reset concept vectors for (i = 0; i < n_Docs; i++) { if((Cluster[i] <0) || (Cluster[i] > n_Clusters)) //marked cluster[i]<0 points as seeds Cluster[i] =0; } compute_cluster_size(); p_Docs->right_dom_SV(Cluster, ClusterSize, n_Clusters, Concept_Vector, cluster_quality, -1); for (i = 0; i < n_Clusters; i++) p_Docs->squared_trans_mult(Concept_Vector[i], Sim_Mat[i]); for (i = 0; i < n_Clusters; i++) diff[i] = 0.0; // because we need give the coherence here. Result = coherence(n_Clusters); if(dumpswitch || evaluate) { outputClusterSize(); cout<<"Initial Obj. func.: "<<Result<<endl; if (n_Class >0) cout<<"Initial confusion matrix :"<<endl; generate_Confusion_Matrix (label, n_Class); } if(evaluate) { purity_Entropy_MutInfo(); F_measure(label, n_Class); micro_avg_precision_recall(); cout<<endl; cout <<"* Evaluation done. *"<<endl; exit (0); } if (f_v_times >0) { quality_change_mat = new (float *)[n_Clusters]; for (int j = 0; j < n_Clusters; j++) quality_change_mat [j] = new float[n_Docs]; memory_consume+=(n_Clusters*n_Docs)*sizeof(float); for (i = 0; i < n_Clusters; i++) update_quality_change_mat (p_Docs, i); } memory_consume+=p_Docs->GetMemoryUsed();}void Diametric_k_means::well_separated_centroids(Matrix *p_Docs, int choice){ int i, j, k, min_ind, *cv = new int [n_Clusters]; float min, cos_sum; bool *mark = new bool [n_Docs]; float **vp, cq[1]; int cs[1]; for (i=0; i< n_Docs; i++) mark[i] = false; for (i=0; i< n_Empty_Docs; i++) mark[empty_Docs_ID[i]] = true; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; switch (choice) { case 0: do{ cv[0] = rand_gen.GetUniformInt(n_Docs); } while(mark[cv[0]]); if(dumpswitch) { cout<<"Cluster centroids are chosen to be well separated from each other."<<endl; cout<<"Start with a random chosen vector"<<endl; } break; case 1: default: float *v, min; int min_ID=0; v = new float[n_Words]; float temp; k=0; vp = &(v); v[0] =1.0; cs[0] = n_Docs; for (i=0; i<n_Docs; i++) Cluster[i] =0; p_Docs->right_dom_SV(Cluster, cs, 1, vp, cq, -1); //for (int l=0; l<n_Words; l++) //cout<<v[l]<<" "; //cout<<endl; min =1.0; min_ID =0; for(i=0; i<n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp = p_Docs->squared_dot_mult(v, i); if ( temp < min ) { min = temp; min_ID = i; } i++; } k++; } cv[0] = min_ID; delete [] v; if(dumpswitch) { cout<<"Cluster centroids are chosen to be well separated from each other."<<endl; cout<<"Start with a vector farthest from the centroid of the whole data set"<<endl; } break; } p_Docs->ith_add_CV(cv[0], Concept_Vector[0]); mark[cv[0]] = true; p_Docs->squared_trans_mult(Concept_Vector[0], Sim_Mat[0]); for (i=1; i<n_Clusters; i++) { min_ind = 0; min = 2.0 * n_Clusters; for (j=0; j<n_Docs; j++) { if(!mark[j]) { cos_sum = 0; for (k=0; k<i; k++) cos_sum += Sim_Mat[k][j]; if (cos_sum < min) { min = cos_sum; min_ind = j; } } } cv[i] = min_ind; p_Docs->ith_add_CV(cv[i], Concept_Vector[i]); p_Docs->squared_trans_mult(Concept_Vector[i], Sim_Mat[i]); mark[cv[i]] = true; } for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false); if(dumpswitch) { cout<<"Vectors chosen to be the centroids are : "; for (i=0; i<n_Clusters; i++) cout<<cv[i]<<" "; cout<<endl; } delete [] cv; delete [] mark;}void Diametric_k_means::random_centroids(Matrix *p_Docs){ int i, j, *cv = new int [n_Clusters]; bool *mark= new bool[n_Docs]; for (i=0; i< n_Docs; i++) mark[i] = false; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; for (i=0; i<n_Clusters; i++) { do{ cv[i] = rand_gen.GetUniformInt(n_Docs); } while (mark[cv[i]]); mark[cv[i]] = true; p_Docs->ith_add_CV(cv[i], Concept_Vector[i]); } if(dumpswitch) { cout<<"Randomly chose centroids"<<endl; cout<<"Vectors chosen to be the centroids are : "; for (i=0; i<n_Clusters; i++) cout<<cv[i]<<" "; cout<<endl; } delete [] mark; delete [] cv; for (i = 0; i < n_Clusters; i++) p_Docs->squared_trans_mult(Concept_Vector[i], Sim_Mat[i]); for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false);}void Diametric_k_means::random_perturb_init(Matrix *p_Docs){ VECTOR_double v, temp_v; int i, j; float temp_norm; float **vp, cq[1]; int cs[1]; v = new float[n_Words]; temp_v = new float[n_Words]; vp = &(v); cs[0] = n_Docs; for (i=0; i<n_Docs; i++) Cluster[i] =0; p_Docs->right_dom_SV(Cluster, cs, 1, vp, cq, -1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -