📄 kullback_leibler_k_means.cc
字号:
int k; /* for(i=0;i<n_Clusters;i++) { for (j=0;j<n_Docs; j++) cout<<quality_change_mat[i][j]<<" "; cout<<endl; } for (i=0; i<n_Docs; i++) cout<<p_Docs->getPrior(i)<<endl; for (i=0; i<n_Docs; i++) cout<<p_Docs->GetNorm(i)<<endl; */ k=0; max_diff= Result; if(f_v_times>0) { for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { if (!already_marked(i)) //a document can only be moved in one first-variation in the whole clustering for (j = 0; j < n_Clusters; j++) { if (j != Cluster[i]) { //temp_diff = delta_X ( p_Docs, i, j); temp_diff = quality_change_mat[j][i]; if (temp_diff < max_diff) { max_change_id= i; where_to_go = j; max_diff = temp_diff; } } } i++; } k++; } } else //if(i_k_means_times >0) { do i= rand_gen.GetUniformInt(n_Docs); while(already_marked(i) || empty_vector(i)); max_change_id=i; for (j = 0; j < n_Clusters; j++) if (j != Cluster[i]) { temp_diff = delta_X ( p_Docs, i, j); if (temp_diff < max_diff) { where_to_go = j; max_diff = temp_diff; } } } track[step].doc_ID = max_change_id; track[step].from_Cluster = Cluster[max_change_id]; track[step].to_Cluster = where_to_go; return max_diff;}float Kullback_leibler_k_means::delta_X ( Matrix *p_Docs, int x, int c_ID) //if Cluster[x] <0, we compute quality change of cluster c_ID due to adding x into it; //if c_ID <0, we compute quality change of cluster Cluster[x] due to deleting x from it.{ float quality_change=0.0, *temp, temp_norm, temp_sum_log; if ((Cluster[x] == c_ID) || (ClusterSize[Cluster[x]] ==1)) return 0; temp = new float [n_Words]; if (Cluster[x]>=0){ for (int j=0; j<n_Words; j++) temp [j] = Concept_Vector[Cluster[x]][j]; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: p_Docs->CV_sub_ith_prior(x, temp); break; case PRIOR_LAPLACE: for (int j=0; j<n_Words; j++) temp [j] -= 1.0; p_Docs->CV_sub_ith(x, temp); break; } temp_norm = norm_1(temp, n_Words); temp_sum_log =0.0; for (int j=0; j<n_Words; j++) if (temp[j]>0) temp_sum_log += temp[j]*log(temp[j]); temp_sum_log -= temp_norm * log(temp_norm ); quality_change =C_log_C[Cluster[x]]-temp_sum_log - p_Docs->GetNorm(x)*p_Docs->getPrior(x) ; } if (c_ID >=0){ for (int j=0; j<n_Words; j++) temp [j] = Concept_Vector[c_ID][j]; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: p_Docs->ith_add_CV_prior(x, temp); break; case PRIOR_LAPLACE: for (int j=0; j<n_Words; j++) temp [j] += 1.0; p_Docs->ith_add_CV(x, temp); break; } temp_norm = norm_1(temp, n_Words); temp_sum_log =0.0; for (int j=0; j<n_Words; j++) if (temp[j]>0) temp_sum_log += temp[j]*log(temp[j]); temp_sum_log -= temp_norm * log(temp_norm ); quality_change += C_log_C[c_ID]-temp_sum_log + p_Docs->GetNorm(x)*p_Docs->getPrior(x); } delete [] temp; return quality_change / log(2.0);}/* split functions */float Kullback_leibler_k_means::Split_Clusters(Matrix *p_Docs, int worst_vector, float threshold){ int worst_vector_s_cluster_ID, i, j; float change; if(dumpswitch) { cout <<endl<<"- Split the clusters -"<<endl; cout <<"The worst vector "<<worst_vector<<" is in cluster "<<Cluster[worst_vector]; if(n_Class>0) cout<<" and in category "<<label[worst_vector]<<endl; else cout<< endl; } worst_vector_s_cluster_ID = Cluster[worst_vector]; change = delta_X(p_Docs, worst_vector, -1) +Omiga; if(change < fv_threshold) { Cluster[worst_vector] = n_Clusters; n_Clusters ++; ClusterSize[worst_vector_s_cluster_ID] --; for (i=0; i<n_Clusters-1; i++) delete [] old_CV[i]; delete [] old_CV; old_CV = new VECTOR_double[n_Clusters]; for (i = 0; i < n_Clusters; i++) old_CV[i] = new float[n_Words]; new_ClusterSize = new int [n_Clusters]; new_cluster_quality = new float [n_Clusters]; new_prior = new float [n_Clusters]; new_Concept_Vector = new (float*) [n_Clusters]; for (i = 0; i <n_Clusters; i++) new_Concept_Vector[i] = new float [n_Words]; for (i = 0; i < n_Clusters-1; i++) { for (j = 0; j < n_Words; j++) new_Concept_Vector[i][j] = Concept_Vector[i][j]; new_cluster_quality[i] = cluster_quality[i]; new_prior[i] = prior[i]; new_ClusterSize[i] = ClusterSize[i]; } new_ClusterSize[n_Clusters-1]=1; new_cluster_quality [n_Clusters-1]=0; delete [] cluster_quality; C_log_C=cluster_quality = new_cluster_quality; delete [] ClusterSize; ClusterSize = new_ClusterSize; for (i = 0; i < n_Clusters-1; i++) delete[] Concept_Vector[i]; delete [] Concept_Vector; Concept_Vector = new_Concept_Vector; switch (laplacian) { case NOLAPLACE: case CENTER_LAPLACE: for(j=0; j<n_Words; j++) Concept_Vector[n_Clusters-1][j]=0.0; p_Docs->ith_add_CV_prior(worst_vector, Concept_Vector[n_Clusters-1]); p_Docs->CV_sub_ith_prior(worst_vector, Concept_Vector[worst_vector_s_cluster_ID]); break; case PRIOR_LAPLACE: for (j=0; j< n_Words; j++) Concept_Vector[n_Clusters-1][j] =1.0; p_Docs->ith_add_CV(worst_vector, Concept_Vector[n_Clusters-1]); for (j=0; j< n_Words; j++) Concept_Vector[worst_vector_s_cluster_ID][j] -=1.0; p_Docs->CV_sub_ith(worst_vector, Concept_Vector[worst_vector_s_cluster_ID]); break; } new_prior[n_Clusters-1] = p_Docs->getPrior(worst_vector); new_prior[worst_vector_s_cluster_ID] = norm_1(Concept_Vector[worst_vector_s_cluster_ID], n_Words); delete [] prior; prior = new_prior; compute_sum_log(n_Clusters-1); compute_sum_log(worst_vector_s_cluster_ID); new_Sim_Mat = new (float *)[n_Clusters]; for ( j = 0; j < n_Clusters; j++) new_Sim_Mat[j] = new float[n_Docs]; for (i=0; i<n_Clusters-1; i++) for (j=0; j<n_Docs; j++) new_Sim_Mat[i][j] = Sim_Mat[i][j] ; p_Docs->Kullback_leibler(Concept_Vector[n_Clusters-1], new_Sim_Mat[n_Clusters-1], laplacian, prior[n_Clusters-1]); p_Docs->Kullback_leibler(Concept_Vector[worst_vector_s_cluster_ID], new_Sim_Mat[worst_vector_s_cluster_ID], laplacian, prior[worst_vector_s_cluster_ID]); for (i=0; i<n_Clusters-1; i++) delete [] Sim_Mat[i]; /* there is some problem in linux about this following deletion */ delete [] Sim_Mat; Sim_Mat = new_Sim_Mat; if (dumpswitch) generate_Confusion_Matrix (label, n_Class); if ( f_v_times > 0) { new_quality_change_mat = new (float *)[n_Clusters]; for ( j = 0; j < n_Clusters; j++) new_quality_change_mat [j] = new float[n_Docs]; update_quality_change_mat (p_Docs, worst_vector_s_cluster_ID); for (i=0; i<n_Clusters-1; i++) for (j=0; j<n_Docs; j++) new_quality_change_mat [i][j] = quality_change_mat [i][j]; for (i=0; i<n_Clusters-1; i++) delete [] quality_change_mat[i]; delete []quality_change_mat; quality_change_mat = new_quality_change_mat; update_quality_change_mat (p_Docs, n_Clusters-1); } } else cout<<endl<<"This splitting doesn't benefit objective function. No splitting."<<endl; return change;}/* tool functions */void Kullback_leibler_k_means::remove_Empty_Clusters() /* remove empty clusters after general k-means and fv because fv gets rid of empty clusters*/{ int *cluster_label; int i,j, tmp_label; cluster_label = new int [n_Clusters]; tmp_label =0; empty_Clusters=0; for(i=0; i<n_Clusters; i++) { if(ClusterSize[i] == 0) { empty_Clusters ++; } else { cluster_label[i]= tmp_label; tmp_label++; } } if(empty_Clusters !=0) { cout<<empty_Clusters<<" empty clusters generated."<<endl<<"Cluster IDs have been changed."<<endl; int k=0; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { Cluster[i] = cluster_label[Cluster[i]]; i++; } k++; } for (i=0; i< n_Clusters; i++) if ((ClusterSize[i] != 0) && (cluster_label[i] != i)) { for (j=0; j<n_Words; j++) Concept_Vector[cluster_label[i]][j] = Concept_Vector[i][j]; for (j=0; j<n_Docs; j++) Sim_Mat[cluster_label[i]][j] = Sim_Mat[i][j]; diff[cluster_label[i]] = diff[i]; ClusterSize[cluster_label[i]] = ClusterSize[i]; cluster_quality[cluster_label[i]]=cluster_quality[i]; prior[cluster_label[i]] = prior[i]; } for(i= n_Clusters- empty_Clusters ; i< n_Clusters; i++) { delete [] Concept_Vector[i]; delete [] old_CV[i]; delete [] Sim_Mat[i]; } n_Clusters -=empty_Clusters; } delete [] cluster_label;}float Kullback_leibler_k_means::verify_obj_func(Matrix *p_Docs, int n_clus){ int i, k; float value =0.0; k=0; for (i = 0; i < n_Docs; i++) { while (i<empty_Docs_ID[k]) { value += p_Docs->getPrior(i) * p_Docs->Kullback_leibler(Concept_Vector[Cluster[i]], i, laplacian, prior[Cluster[i]]); i++; } k++; } return value/log(2.0)+n_clus*Omiga;}int Kullback_leibler_k_means::find_worst_vectors(bool dumped){ int i, k, worst_vector=0; float min_sim, *worst_vec_per_cluster; worst_vec_per_cluster = new float[n_Clusters]; min_sim = 0; for (i = 0; i <n_Clusters; i++) worst_vec_per_cluster[i] = 0; k=0; for (i=0; i<n_Docs; i++) { while (i<empty_Docs_ID[k]) { if ( Sim_Mat[Cluster[i]][i] > worst_vec_per_cluster[Cluster[i]]) worst_vec_per_cluster[Cluster[i]] =Sim_Mat[Cluster[i]][i]; if ( Sim_Mat[Cluster[i]][i] > min_sim) { worst_vector =i; min_sim = Sim_Mat[Cluster[i]][i] ; } i++; } k++; } for (i = 0; i <n_Clusters; i++) { if (ClusterSize[i]==0) { if(dumpswitch) cout <<"Cluster "<<i<<" is empty."<<endl; } else { if(dumpswitch && !dumped) { cout<<"#"<<i<<" : quality/# doc/av_quality/L1_norm/sum_log/Worst dp = "; cout<<cluster_quality[i]<<"/"<<ClusterSize[i]<<"/"<<cluster_quality[i]/ClusterSize[i]; cout<<"/"<<prior[i]<<"/"<<C_log_C[i]; cout<<"/"<<worst_vec_per_cluster[i]<<endl; } } } if(dumpswitch && !dumped) { cout<<"Vector "<<worst_vector<< " has the worst Kullback-leibler "<<min_sim<<endl; } delete [] worst_vec_per_cluster; return worst_vector;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -