📄 spherical_k_means.cc
字号:
for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) old_CV[i][j] = Concept_Vector[i][j]; for (i=0; i< n_Clusters; i++) old_CV_norm[i] = CV_norm[i]; for (i=0; i< n_Clusters; i++) old_ClusterSize[i] = ClusterSize[i]; /* for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Words; j++) Concept_Vector[i][j] *= CV_norm[i]; for (i = 0; i < n_Clusters; i++) for (j = 0; j < n_Docs; j++) Sim_Mat[i][j] *= CV_norm[i]; for (i=0; i<n_Clusters; i++) update_quality_change_mat(p_Docs, i); */ cout<<endl<<"+ first variation +"<<endl; pre_change =0.0; float temp_result = Result; int act_f_v_times = f_v_times; for (i=0; i< f_v_times; i++) { change[i] = one_f_v_move(p_Docs, track, i); if ( track[i].doc_ID <0) { cout <<"This current move occurs at a vector that's already chosen to move. So stop first variation."<<endl; act_f_v_times = i; break; } mark[i] = track[i].doc_ID; if ( dumpswitch) { cout<<"Vector "<<track[i].doc_ID<<" was moved from C"<<track[i].from_Cluster<<" to C"<<track[i].to_Cluster; if(n_Class>0) cout<<endl<<"And it is in category "<<label[track[i].doc_ID]<<endl; else cout<< endl; cout<<"Change of objective fun. of step "<<i+1<<" is : "<<change[i]<<endl; temp_result+=change[i]; } else cout<<"F"; original_Cluster[i] = Cluster[track[i].doc_ID]; Cluster[track[i].doc_ID] = track[i].to_Cluster; for (j=0; j<n_Words; j++) Concept_Vector[track[i].from_Cluster][j] *= CV_norm[track[i].from_Cluster]; p_Docs->CV_sub_ith(track[i].doc_ID, Concept_Vector[track[i].from_Cluster]); for (j=0; j<n_Words; j++) Concept_Vector[track[i].to_Cluster][j] *= CV_norm[track[i].to_Cluster]; p_Docs->ith_add_CV(track[i].doc_ID, Concept_Vector[track[i].to_Cluster]); CV_norm[track[i].from_Cluster]=normalize_vec(Concept_Vector[track[i].from_Cluster], n_Words); CV_norm[track[i].to_Cluster]=normalize_vec(Concept_Vector[track[i].to_Cluster], n_Words); /* CV_norm[track[i].from_Cluster] += 1.0-2.0*Sim_Mat[track[i].from_Cluster][track[i].doc_ID]; CV_norm[track[i].to_Cluster] += 1.0+2.0*Sim_Mat[track[i].to_Cluster][track[i].doc_ID]; */ /* do I need to do this every time? Yes because we use Sim_Mat to compute delta_X */ p_Docs->trans_mult(Concept_Vector[track[i].from_Cluster], Sim_Mat[track[i].from_Cluster]); p_Docs->trans_mult(Concept_Vector[track[i].to_Cluster], Sim_Mat[track[i].to_Cluster]); /* for (j=0; j<n_Docs; j++) { Sim_Mat[track[i].from_Cluster][j] -= 1.0; Sim_Mat[track[i].to_Cluster][j] += 1.0; } */ total_change[i] = pre_change+change[i]; pre_change = total_change[i]; if (max_change <total_change[i]) { max_change = total_change[i]; max_change_index = i; } ClusterSize[track[i].from_Cluster] --; ClusterSize[track[i].to_Cluster] ++; update_quality_change_mat(p_Docs, track[i].from_Cluster); update_quality_change_mat(p_Docs, track[i].to_Cluster); } cout<<endl; if ( dumpswitch) if (max_change > s_bar) cout<<"Max change of objective fun. "<<max_change<<" occures at step "<<max_change_index+1<<endl; else cout<<"No change of objective fun."<<endl; for (i=0; i< n_Clusters; i++) been_converted[i] = false; if (max_change <=s_bar) { max_change_index= -1; max_change =0.0; for (i=max_change_index+1; i<act_f_v_times; i++) Cluster[track[i].doc_ID] = original_Cluster[i] ; for (i=0; i< n_Clusters; i++) { CV_norm[i] = old_CV_norm[i]; ClusterSize[i] = old_ClusterSize[i]; for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = old_CV[i][j] ; p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]); diff[i]= 0.0; update_quality_change_mat(p_Docs, i); } } else { if (max_change_index < act_f_v_times-1) { for (i=0; i<=max_change_index ;i++) { Cluster[track[i].doc_ID] = track[i].to_Cluster; if(been_converted[track[i].from_Cluster] ==false) { for (j=0; j<n_Words; j++) old_CV[track[i].from_Cluster][j] *= old_CV_norm[track[i].from_Cluster]; been_converted[track[i].from_Cluster] = true; } if(been_converted[track[i].to_Cluster] ==false) { for (j=0; j<n_Words; j++) old_CV[track[i].to_Cluster][j] *= old_CV_norm[track[i].to_Cluster]; been_converted[track[i].to_Cluster] = true; } p_Docs->CV_sub_ith(track[i].doc_ID, old_CV[track[i].from_Cluster]); p_Docs->ith_add_CV(track[i].doc_ID, old_CV[track[i].to_Cluster]); old_ClusterSize[track[i].from_Cluster] --; old_ClusterSize[track[i].to_Cluster] ++; } for (i=max_change_index+1; i<act_f_v_times; i++) Cluster[track[i].doc_ID] = original_Cluster[i] ; for (i=0; i< n_Clusters; i++) { if (been_converted[i] == true) CV_norm[i] = normalize_vec(old_CV[i], n_Words); else CV_norm[i] = old_CV_norm[i]; ClusterSize[i] = old_ClusterSize[i]; for (j = 0; j < n_Words; j++) Concept_Vector[i][j] = old_CV[i][j] ; p_Docs->trans_mult(Concept_Vector[i], Sim_Mat[i]); diff[i]= 0.0; } for (i=0; i<n_Clusters; i++) update_quality_change_mat(p_Docs, i); } else // max_change_index == act_f_v_times { for (i=0; i< n_Clusters; i++) { for (j = 0; j < n_Words; j++) { //Concept_Vector[i][j] /= CV_norm[i]; old_CV[i][j] = Concept_Vector[i][j]; } /* for (j = 0; j < n_Docs; j++) Sim_Mat[i][j] /= CV_norm[i]; */ diff[0] =0; } } if (dumpswitch) generate_Confusion_Matrix (label, n_Class); } //cout<<"!!"<<coherence(p_Docs, n_Clusters)<<endl; delete [] change; delete [] total_change; delete [] old_CV_norm; delete [] been_converted; delete [] track; delete [] original_Cluster; return max_change; }float Spherical_k_means::one_f_v_move(Matrix *p_Docs, one_step track [], int step){ int i, j, max_change_id=-1, where_to_go=-1; float max_diff=-1.0e10, temp_diff; int k; k=0; max_diff=-1.0*n_Docs; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { if (!already_marked(i)) // a document can only be moved in one first-variation in the whole clustering for (j = 0; j < n_Clusters; j++) { if (j != Cluster[i]) { //temp_diff = delta_X ( p_Docs, i, j); temp_diff = quality_change_mat[j][i]; if (temp_diff > max_diff) { max_change_id= i; where_to_go = j; max_diff = temp_diff; } } } i++; } k++; } track[step].doc_ID = max_change_id; track[step].from_Cluster = Cluster[max_change_id]; track[step].to_Cluster = where_to_go; return max_diff;}float Spherical_k_means::delta_X ( Matrix *p_Docs, int x, int c_ID){ float quality_change =0.0; if ((Cluster[x] == c_ID) || (ClusterSize[Cluster[x]] ==1)) return 0; if (Cluster[x] >=0) quality_change = sqrt(CV_norm[Cluster[x]]*CV_norm[Cluster[x]]-2*CV_norm[Cluster[x]]*Sim_Mat[Cluster[x]][x]+1)-CV_norm[Cluster[x]]; if (c_ID >=0) quality_change += sqrt(CV_norm[c_ID]*CV_norm[c_ID]+2*CV_norm[c_ID]*Sim_Mat[c_ID][x]+1)-CV_norm[c_ID]; /* quality_change=sqrt(CV_norm[Cluster[x]]*CV_norm[Cluster[x]]-2*dot1+1)-CV_norm[Cluster[x]]+sqrt(CV_norm[c_ID]*CV_norm[c_ID]+2*dot2+1)-CV_norm[c_ID];*/ return quality_change;}/* split functions */float Spherical_k_means::Split_Clusters(Matrix *p_Docs, int worst_vector, float threshold){ int worst_vector_s_cluster_ID, i, j; float change; if(dumpswitch) { cout <<endl<<"- Split the clusters -"<<endl; cout <<"The worst vector "<<worst_vector<<" is in cluster "<<Cluster[worst_vector]; if(n_Class>0) cout<<" and in category "<<label[worst_vector]<<endl; else cout<< endl; } worst_vector_s_cluster_ID = Cluster[worst_vector]; change = delta_X(p_Docs, worst_vector, -1) +1.0+ Omiga; if(change > threshold) { // reform the cluster controids. Cluster[worst_vector] = n_Clusters; n_Clusters ++; ClusterSize[worst_vector_s_cluster_ID] --; new_Concept_Vector = new (float*) [n_Clusters]; for (i = 0; i <n_Clusters; i++) new_Concept_Vector[i] = new float [n_Words]; new_CV_norm = new float [n_Clusters]; new_cluster_quality = new_CV_norm; for (i = 0; i < n_Clusters-1; i++) { for (j = 0; j < n_Words; j++) new_Concept_Vector[i][j] = Concept_Vector[i][j]; new_CV_norm[i] = CV_norm[i]; } for (j = 0; j < n_Words; j++) new_Concept_Vector[n_Clusters-1][j]=0.0; p_Docs->ith_add_CV(worst_vector, new_Concept_Vector[n_Clusters-1]); new_cluster_quality[n_Clusters-1] = 1.0; for (j=0; j<n_Words; j++) new_Concept_Vector[worst_vector_s_cluster_ID][j] = Concept_Vector[worst_vector_s_cluster_ID][j]*CV_norm[worst_vector_s_cluster_ID]; p_Docs->CV_sub_ith(worst_vector, new_Concept_Vector[worst_vector_s_cluster_ID]); new_CV_norm[worst_vector_s_cluster_ID]= normalize_vec(new_Concept_Vector[worst_vector_s_cluster_ID], n_Words); for (i=0; i<n_Clusters-1; i++) delete [] old_CV[i]; delete [] old_CV; old_CV = new VECTOR_double[n_Clusters]; for (i = 0; i < n_Clusters; i++) old_CV[i] = new float[n_Words]; for (i = 0; i < n_Clusters-1; i++) delete[] Concept_Vector[i]; delete [] Concept_Vector; Concept_Vector=new_Concept_Vector; new_diff =new float [n_Clusters]; for (i=0; i<n_Clusters-1; i++) new_diff[i]= diff[i]; new_diff[n_Clusters-1]=0.0; delete [] diff; diff = new_diff; new_Sim_Mat = new (float *)[n_Clusters]; for ( j = 0; j < n_Clusters; j++) new_Sim_Mat[j] = new float[n_Docs]; for (i=0; i<n_Clusters-1; i++) for (j=0; j<n_Docs; j++) new_Sim_Mat[i][j] = Sim_Mat[i][j] ; p_Docs->trans_mult(Concept_Vector[n_Clusters-1], new_Sim_Mat[n_Clusters-1]); p_Docs->trans_mult(Concept_Vector[worst_vector_s_cluster_ID], new_Sim_Mat[worst_vector_s_cluster_ID]); for (i=0; i<n_Clusters-1; i++) delete [] Sim_Mat[i]; /* there is some problem in linux about this following deletion */ delete [] Sim_Mat; Sim_Mat = new_Sim_Mat; delete [] cluster_quality; CV_norm = cluster_quality = new_cluster_quality; new_ClusterSize = new int [n_Clusters]; for (i=0; i<n_Clusters-1; i++) new_ClusterSize[i] = ClusterSize[i]; new_ClusterSize[n_Clusters-1]=1; delete [] ClusterSize; ClusterSize = new_ClusterSize; if (dumpswitch) generate_Confusion_Matrix (label, n_Class); if ( f_v_times > 0) { new_quality_change_mat = new (float *)[n_Clusters]; for ( j = 0; j < n_Clusters; j++) new_quality_change_mat [j] = new float[n_Docs]; update_quality_change_mat (p_Docs, worst_vector_s_cluster_ID); for (i=0; i<n_Clusters-1; i++) for (j=0; j<n_Docs; j++) new_quality_change_mat [i][j] = quality_change_mat [i][j]; for (i=0; i<n_Clusters-1; i++) delete [] quality_change_mat[i]; delete []quality_change_mat; quality_change_mat = new_quality_change_mat; update_quality_change_mat (p_Docs, n_Clusters-1); } } else cout<<endl<<"This splitting doesn't benefit objective function. No splitting."<<endl; return change;}/* tool functions */void Spherical_k_means::remove_Empty_Clusters() /* remove empty clusters after general k-means and fv because fv gets rid of empty clusters*/{ int *cluster_label; int i,j, tmp_label; cluster_label = new int [n_Clusters]; tmp_label =0; empty_Clusters=0; for(i=0; i<n_Clusters; i++) { if(ClusterSize[i] == 0) { empty_Clusters ++; } else { cluster_label[i]= tmp_label; tmp_label++; } } if(empty_Clusters !=0) { cout<<empty_Clusters<<" empty clusters generated."<<endl<<"Cluster IDs have been changed."<<endl; int k=0; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { Cluster[i] = cluster_label[Cluster[i]]; i++; } k++; } for (i=0; i< n_Clusters; i++) if ((ClusterSize[i] != 0) && (cluster_label[i] != i)) { for (j=0; j<n_Words; j++) Concept_Vector[cluster_label[i]][j] = Concept_Vector[i][j]; for (j=0; j<n_Docs; j++) Sim_Mat[cluster_label[i]][j] = Sim_Mat[i][j]; diff[cluster_label[i]] = diff[i]; ClusterSize[cluster_label[i]] = ClusterSize[i]; cluster_quality[cluster_label[i]]=cluster_quality[i]; } for(i= n_Clusters- empty_Clusters ; i< n_Clusters; i++) { delete [] Concept_Vector[i]; delete [] old_CV[i]; delete [] Sim_Mat[i]; } n_Clusters -=empty_Clusters; } delete [] cluster_label;}float Spherical_k_means::verify_obj_func(Matrix *p_Docs, int n_clus){ int i; float value =0.0; for (i = 0; i < n_Docs; i++) value += p_Docs->dot_mult(Concept_Vector[Cluster[i]], i); return value+n_clus*Omiga;}int Spherical_k_means::find_worst_vectors(bool dumped){ int i, k, worst_vector=0; float min_sim=1.0, *worst_vec_per_cluster; worst_vec_per_cluster = new float[n_Clusters]; min_sim = 1.0; for (i = 0; i <n_Clusters; i++) worst_vec_per_cluster[i] = 1.0; k=0; for (i=0; i<n_Docs; i++) { while (i<empty_Docs_ID[k]) { if ( Sim_Mat[Cluster[i]][i] < worst_vec_per_cluster[Cluster[i]]) worst_vec_per_cluster[Cluster[i]] =Sim_Mat[Cluster[i]][i]; if ( Sim_Mat[Cluster[i]][i] < min_sim) { worst_vector =i; min_sim = Sim_Mat[Cluster[i]][i] ; } i++; } k++; } s_bar = 0.0; for (i = 0; i <n_Clusters; i++) { if (ClusterSize[i]==0) { if(dumpswitch) cout <<"Cluster "<<i<<" is empty."<<endl; } else { float temp; if(dumpswitch && !dumped) { temp=cluster_quality[i]*cluster_quality[i]-2*cluster_quality[i]*worst_vec_per_cluster[i]+1; temp=sqrt(temp); cout<<"#"<<i<<" : quality/# doc/av_quality/Omega_1/Worst dp = "; cout<<cluster_quality[i]<<"/"<<ClusterSize[i]<<"/"<<cluster_quality[i]/ClusterSize[i]; cout<<"/"<<cluster_quality[i]-1.0-temp; cout<<"/"<<worst_vec_per_cluster[i]<<endl; } if(s_bar <cluster_quality[i]) s_bar = cluster_quality[i]; } } s_bar = 1.0/s_bar; if(dumpswitch && !dumped) { cout<<"Vector "<<worst_vector<< " has the worst dot product "<<min_sim<<endl; } delete [] worst_vec_per_cluster; return worst_vector;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -