📄 kullback_leibler_k_means.cc
字号:
float temp, temp_norm; max_ind=0; max =0.0; temp_norm = norm_1(v, n_Words); k=0; for(i=0; i<n_Docs; i++) { while (i<empty_Docs_ID[k]) { temp = p_Docs->Kullback_leibler(v, i, laplacian, temp_norm); if ( temp > max ) { max = temp; max_ind = i; } i++; } k++; } cv[0] = max_ind; delete [] v; if(dumpswitch) { cout<<"Cluster centroids are chosen to be well separated from each other."<<endl; cout<<"Start with a vector farthest from the centroid of the whole data set"<<endl; } break; } p_Docs->ith_add_CV_prior(cv[0], Concept_Vector[0]); prior[0] = norm_1(Concept_Vector[0], n_Words); compute_sum_log(0); mark[cv[0]] = true; p_Docs->Kullback_leibler(Concept_Vector[0], Sim_Mat[0], laplacian, prior[0]); for (i=1; i<n_Clusters; i++) { max_ind = 0; max = 0; for (j=0; j<n_Docs; j++) { if(!mark[j]) { cos_sum = 0; for (k=0; k<i; k++) cos_sum += Sim_Mat[k][j]; if (cos_sum > max) { max = cos_sum; max_ind = j; } } } cv[i] = max_ind; p_Docs->ith_add_CV_prior(cv[i], Concept_Vector[i]); prior[i] = norm_1(Concept_Vector[i], n_Words); compute_sum_log(i); p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]); mark[cv[i]] = true; } for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false); if(dumpswitch) { cout<<"Vectors chosen to be the centroids are : "; for (i=0; i<n_Clusters; i++) cout<<cv[i]<<" "; cout<<endl; } delete [] cv; delete [] mark;}void Kullback_leibler_k_means::random_centroids(Matrix *p_Docs){ int i, j, k, *cv = new int [n_Clusters]; bool *mark= new bool[n_Docs]; k=0; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { mark[i] = false; i++; } mark[i]=true; k++; } switch (laplacian) { case CENTER_LAPLACE: case NOLAPLACE: for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; break; case PRIOR_LAPLACE: for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 1.0; break; } for (i=0; i<n_Clusters; i++) { do{ cv[i] = rand_gen.GetUniformInt(n_Docs); } while (mark[cv[i]]); mark[cv[i]] = true; p_Docs->ith_add_CV_prior(cv[i], Concept_Vector[i]); prior[i] = norm_1(Concept_Vector[i], n_Words); compute_sum_log(i); } if(dumpswitch) { cout<<"Randomly chose centroids"<<endl; cout<<"Vectors chosen to be the centroids are : "; for (i=0; i<n_Clusters; i++) cout<<cv[i]<<" "; cout<<endl; } delete [] mark; delete [] cv; for (i = 0; i < n_Clusters; i++) p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]); for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false);}void Kullback_leibler_k_means::random_perturb_init(Matrix *p_Docs){ VECTOR_double v, temp_v; int i, j; float temp_norm; v = new float[n_Words]; temp_v = new float[n_Words]; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: for (i = 0; i < n_Words; i++) v[i] = 0.0; break; case PRIOR_LAPLACE: for (i = 0; i < n_Words; i++) v[i] = n_Docs; break; } for (i = 0; i < n_Docs; i++) p_Docs->ith_add_CV_prior(i, v); normalize_vec_1(v, n_Words); for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = 0.0; for (i = 0; i < n_Clusters; i++) { for (j = 0; j < n_Words; j++) temp_v[j] = rand_gen.GetUniform() - 0.5; normalize_vec_1(temp_v, n_Words); temp_norm = Perturb * rand_gen.GetUniform(); for (j = 0; j < n_Words; j++) temp_v[j] *= temp_norm; for (j = 0; j < n_Words; j++) Concept_Vector[i][j] += fabs(v[j]*(1+ temp_v[j])); normalize_vec_1(Concept_Vector[i], n_Words); prior[i]=1.0/n_Clusters; compute_sum_log(i); } delete [] v; delete [] temp_v; for (i = 0; i < n_Clusters; i++) p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]); for(i=0; i<n_Docs; i++) Cluster[i] =0; assign_cluster(p_Docs, false); if(dumpswitch) { cout<<"Generated the centroid of whole data set then perturb around it"<<endl; }}/* first variations functions */float Kullback_leibler_k_means::K_L_first_variation(Matrix *p_Docs){ int i, j, max_change_index=-1; float *change= new float [f_v_times+i_k_means_times], *total_change =new float [f_v_times+i_k_means_times], max_change=0.0, pre_change; float *old_prior = new float[n_Clusters], *old_C_log_C = new float[n_Clusters]; one_step *track = new (one_step) [f_v_times+i_k_means_times]; bool *been_converted= new bool [n_Clusters]; int *original_Cluster =new int [f_v_times+i_k_means_times], *old_ClusterSize =new int [n_Clusters]; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) old_CV[i][j] = Concept_Vector[i][j]; for (i=0; i< n_Clusters; i++){ old_prior[i] = prior[i]; old_C_log_C[i] = C_log_C[i]; old_ClusterSize[i] = ClusterSize[i]; } cout<<endl<<"+ first variation +"<<endl; pre_change =0.0; float temp_result = Result; int act_f_v_times = f_v_times+i_k_means_times; clear_mark (); for (i=0; i< f_v_times || i<i_k_means_times; i++) { change[i] = one_f_v_move(p_Docs, track, i); if ( track[i].doc_ID <0) { cout <<"This current move occurs at a vector that's already chosen to move. So stop first variation."<<endl; act_f_v_times = i; break; } mark[i] = track[i].doc_ID; if ( dumpswitch) { cout<<"Vector "<<track[i].doc_ID<<" was moved from C"<<track[i].from_Cluster<<" to C"<<track[i].to_Cluster; if(n_Class>0) cout<<endl<<"And it is in category "<<label[track[i].doc_ID]<<endl; else cout<< endl; cout<<"Change of objective fun. of step "<<i+1<<" is : "<<change[i]<<endl; temp_result+=change[i]; } else cout<<"F"; original_Cluster[i] = Cluster[track[i].doc_ID]; Cluster[track[i].doc_ID] = track[i].to_Cluster; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: p_Docs->CV_sub_ith_prior(track[i].doc_ID, Concept_Vector[track[i].from_Cluster]); p_Docs->ith_add_CV_prior(track[i].doc_ID, Concept_Vector[track[i].to_Cluster]); break; case PRIOR_LAPLACE: for(j=0; j<n_Words; j++) Concept_Vector[track[i].from_Cluster][j] -= 1.0; p_Docs->CV_sub_ith(track[i].doc_ID, Concept_Vector[track[i].from_Cluster]); for(j=0; j<n_Words; j++) Concept_Vector[track[i].to_Cluster][j] += 1.0; p_Docs->ith_add_CV(track[i].doc_ID, Concept_Vector[track[i].to_Cluster]); break; } prior[track[i].from_Cluster] = norm_1(Concept_Vector[track[i].from_Cluster], n_Words); compute_sum_log(track[i].from_Cluster); prior[track[i].to_Cluster] = norm_1(Concept_Vector[track[i].to_Cluster], n_Words); compute_sum_log(track[i].to_Cluster); p_Docs->Kullback_leibler(Concept_Vector[track[i].from_Cluster], Sim_Mat[track[i].from_Cluster], laplacian, prior[track[i].from_Cluster]); p_Docs->Kullback_leibler(Concept_Vector[track[i].to_Cluster], Sim_Mat[track[i].to_Cluster], laplacian,prior[track[i].to_Cluster]); // notice cluster_quality[] is not updated because we don't need them as long as we have current centers total_change[i] = pre_change+change[i]; pre_change = total_change[i]; if (max_change > total_change[i]) { max_change = total_change[i]; max_change_index = i; } ClusterSize[track[i].from_Cluster] --; ClusterSize[track[i].to_Cluster] ++; if (f_v_times>0) { update_quality_change_mat(p_Docs, track[i].from_Cluster); update_quality_change_mat(p_Docs, track[i].to_Cluster); } } cout<<endl; if ( dumpswitch) if (max_change < fv_threshold) cout<<"Max change of objective fun. "<<max_change<<" occures at step "<<max_change_index+1<<endl; else cout<<"No change of objective fun."<<endl; for (i=0; i<n_Clusters; i++) been_converted[i] =false; if ( max_change >= fv_threshold) { max_change_index = -1; max_change =0.0; for (i=max_change_index+1; i<act_f_v_times; i++) Cluster[track[i].doc_ID] = original_Cluster[i] ; for (i=0; i< n_Clusters; i++) { ClusterSize[i] = old_ClusterSize[i]; for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = old_CV[i][j] ; prior[i] = old_prior[i]; C_log_C[i] = old_C_log_C[i]; p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian,prior[i]); if (f_v_times>0) update_quality_change_mat(p_Docs, i); } } else { if (max_change_index < act_f_v_times-1) { for (i=0; i<=max_change_index ;i++) { Cluster[track[i].doc_ID] = track[i].to_Cluster; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: p_Docs->ith_add_CV_prior(track[i].doc_ID, old_CV[track[i].to_Cluster]); p_Docs->CV_sub_ith_prior(track[i].doc_ID, old_CV[track[i].from_Cluster]); break; case PRIOR_LAPLACE: for(j=0; j<n_Words; j++) old_CV[track[i].to_Cluster][j] += 1.0; p_Docs->ith_add_CV(track[i].doc_ID, old_CV[track[i].to_Cluster]); for(j=0; j<n_Words; j++) old_CV[track[i].from_Cluster][j] -= 1.0; p_Docs->CV_sub_ith(track[i].doc_ID, old_CV[track[i].from_Cluster]); break; } been_converted[track[i].from_Cluster]=true; been_converted[track[i].to_Cluster]=true; old_ClusterSize[track[i].from_Cluster] --; old_ClusterSize[track[i].to_Cluster] ++; } for (i=max_change_index+1; i<act_f_v_times; i++) Cluster[track[i].doc_ID] = original_Cluster[i] ; for (i=0; i< n_Clusters; i++) { ClusterSize[i] = old_ClusterSize[i]; for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = old_CV[i][j] ; if (been_converted[i] == true) { prior[i] = norm_1(Concept_Vector[i], n_Words); compute_sum_log(i); } else { prior[i] = old_prior[i]; C_log_C[i] = old_C_log_C[i]; } p_Docs->Kullback_leibler(Concept_Vector[i], Sim_Mat[i], laplacian, prior[i]); } if (f_v_times>0) for (i=0; i<n_Clusters; i++) update_quality_change_mat(p_Docs, i); } if (dumpswitch) generate_Confusion_Matrix (label, n_Class); } //cout<<"!!"<<coherence(p_Docs, n_Clusters)<<endl; delete [] old_prior; delete [] old_C_log_C; delete [] change; delete [] total_change; delete [] been_converted; delete [] track; delete [] original_Cluster; delete [] old_ClusterSize; return max_change; }float Kullback_leibler_k_means::one_f_v_move(Matrix *p_Docs, one_step track [], int step){ int i, j, max_change_id=-1, where_to_go=-1; float max_diff, temp_diff;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -