📄 sparsematrix.cc

📁 gmeans-- Clustering with first variation and splitting 文本聚类算法Gmeans ,使用了3种相似度函数,cosine,euclidean ,K
💻 CC
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Implementation of SparseMatrix class * Copyright (c) 2003, Yuqiang Guan */#include <assert.h>#include <stdlib.h>#include <iostream.h>#include <math.h>#include "SparseMatrix.h"SparseMatrix::SparseMatrix(int row, int col, int nz, float *val, int *rowind, int *colptr) : Matrix (row, col){  n_row = row;  n_col = col;  n_nz  = nz;  vals = val;  rowinds = rowind;  colptrs = colptr;  memory_used += (n_row+n_col)*sizeof(int)+n_nz*sizeof(float);}SparseMatrix::~SparseMatrix(){  if (norm != NULL)    delete[] norm;  if (L1_norm != NULL)    delete[] L1_norm;}float SparseMatrix::operator()(int i, int j) const{  assert(i >= 0 && i < n_row && j >= 0 && j < n_col);    for (int t = colptrs[j]; t < colptrs[j+1]; t++)    if (rowinds[t] == i) return vals[t];  return 0.0;}float SparseMatrix::dot_mult(float *v, int i)  /*compute the dot-product between the ith vector (in the sparse matrix) with vector v    result is returned.   */{  float result=0.0;  for (int j = colptrs[i]; j < colptrs[i+1]; j++)    result += vals[j] * v[rowinds[j]];  return result;}void SparseMatrix::trans_mult(float *x, float *result)   /*compute the dot-product between every vector (in the sparse matrix) with vector v    results are stored in array 'result'.   */{  for (int i = 0; i < n_col; i++)      result[i] = dot_mult(x, i);}float SparseMatrix::squared_dot_mult(float *v, int i)  /*compute the dot-product between the ith vector (in the sparse matrix) with vector v    result is returned.   */{  float result=0.0;  for (int j = colptrs[i]; j < colptrs[i+1]; j++)    result += vals[j] * v[rowinds[j]];  return result*result;}void SparseMatrix::squared_trans_mult(float *x, float *result)   /*compute the dot-product between every vector (in the sparse matrix) with vector v    results are stored in array 'result'.   */{  for (int i = 0; i < n_col; i++)    result[i] = squared_dot_mult(x, i);}void SparseMatrix::A_trans_A(int flag, int * index, int *pointers, float **A_t_A, int & nz_counter)/* computes A'A given doc_IDs in the same cluster; index[] contains doc_IDs for all docs,      pointers[] gives the range of doc_ID belonging to the same cluster;     the resulting matrix is stored in dense format A_t_A( d by d matrix)     The right SVs of A are the corresponding EVs of A'A      Notice that Gene expression matrix is usually n by d where n is #genes; this matrix     format is different from that of document matrix.      In the main memory the matrix is stored in the format of document matrix.  */{  int i, j, k, l, clustersize = pointers[1]-pointers[0];     for (i=0; i<n_row; i++)    for (j=0; j<n_row; j++)      A_t_A[i][j] = 0;  nz_counter=0;    if (flag >0)    {      for (k=0; k<clustersize; k++)	for (l = colptrs[index[k+pointers[0]]]; l < colptrs[index[k+pointers[0]]+1]; l++)	  for (j = colptrs[index[k+pointers[0]]]; j < colptrs[index[k+pointers[0]]+1]; j++)	    {	      A_t_A[rowinds[l]][rowinds[j]] += vals[l]*vals[j];	    }    }  else    {      for (k=0; k<clustersize; k++)	for (l = colptrs[k+pointers[0]]; l < colptrs[k+pointers[0]+1]; l++)	  for (j = colptrs[k+pointers[0]]; j < colptrs[k+pointers[0]+1]; j++)	    {	      A_t_A[rowinds[l]][rowinds[j]] += vals[l]*vals[j];	    }    }  for (i=0; i<n_row; i++)    for (j=0; j<n_row; j++)      if (A_t_A[i][j] >0)	nz_counter++;}void SparseMatrix::dense_2_sparse(int* AtA_colptr, int *AtA_rowind, float *AtA_val, float **A_t_A){  int i, l, j, k;  k=0;  AtA_colptr[0] = 0;  for (j=0; j<n_row; j++)    {      l=0;      for (i=0; i<n_row; i++)	if (A_t_A[i][j] > 0)	  {	    AtA_val[k] = A_t_A[i][j];	    AtA_rowind[k++] = i;	    l++;	  }      AtA_colptr[j+1] = AtA_colptr[j] + l;    }}void SparseMatrix::right_dom_SV(int *cluster, int *cluster_size, int n_Clusters, float ** CV, float *cluster_quality, int flag)  /*      flag == -1, then update all the centroids and the cluster qualities;      0 <= flag < n_Clusters , then update the centroid and the cluster quality specified by 'flag';     n_Clusters <= flag <2*n_Clusters, then update the cluster quality specified by 'flag-n_Clusters';  */{  int i, *pointer = new int[n_Clusters+1], *index = new int[n_col], *range =new int[2], nz_counter;  int *AtA_rowind =NULL, *AtA_colptr=NULL;  float *AtA_val =NULL;  float **A_t_A;  A_t_A =new (float*) [n_row];  for (i=0; i<n_row; i++)    A_t_A [i] = new float [n_row];  pointer[0] =0;  for (i=1; i<n_Clusters; i++)    pointer[i] = pointer[i-1] + cluster_size[i-1];  for (i=0; i<n_col; i++)    {      index[pointer[cluster[i]]] = i;      pointer[cluster[i]] ++;    }  pointer[0] =0;  for (i=1; i<=n_Clusters; i++)    pointer[i] = pointer[i-1] + cluster_size[i-1];  if (flag <0) // compute eigval and eigvec for all clusters    {      for (i=0; i<n_Clusters; i++)	{	  range[0] = pointer[i];	  range[1] = pointer[i+1];	  A_trans_A(1, index, range, A_t_A, nz_counter);	  AtA_val = new float [nz_counter];	  AtA_rowind = new int [nz_counter];	  AtA_colptr = new int [n_row+1];	  dense_2_sparse(AtA_colptr, AtA_rowind, AtA_val, A_t_A);	  /*	  for (int l=0; l< n_row; l++)	    {	      for (int j=0; j< n_row; j++)		cout<<A_t_A[l][j]<<" ";	      cout<<endl;	    }	 	  for (int l=0; l<= n_row; l++)	    cout<<AtA_colptr[l]<<" ";	  cout<<endl;	  for (int l=0; l<nz_counter; l++)	    cout<<AtA_rowind[l]<<" ";	  cout<<endl;	  for (int l=0; l<nz_counter; l++)	    cout<<AtA_val[l]<<" ";	  cout<<endl<<endl;	  */	  power_method(AtA_val, AtA_rowind, AtA_colptr, n_row, CV[i], CV[i], cluster_quality[i]);	  //for (int l=0; l<n_row; l++)	  //  cout<<CV[i][l]<<" ";	  //cout<<cluster_quality[i]<<endl;	  delete [] AtA_val;	  delete [] AtA_rowind;	  delete [] AtA_colptr;	}      for (i=0; i< n_row; i++)	delete [] A_t_A[i];      delete [] A_t_A;    }   else if ((flag >=0) && (flag<n_Clusters)) //compute eigval and eigvec for cluster 'flag'    {      range[0] = pointer[flag];      range[1] = pointer[flag+1];      A_trans_A(1, index, range, A_t_A, nz_counter);      AtA_val = new float [nz_counter];      AtA_rowind = new int [nz_counter];      AtA_colptr = new int [n_row+1];      dense_2_sparse(AtA_colptr, AtA_rowind, AtA_val, A_t_A);      for (i=0; i< n_row; i++)	delete [] A_t_A[i];      delete [] A_t_A;      power_method(AtA_val, AtA_rowind, AtA_colptr, n_row, CV[flag], CV[flag], cluster_quality[flag]);      delete [] AtA_val;      delete [] AtA_rowind;      delete [] AtA_colptr;    }  else if ((flag >= n_Clusters) && (flag <= 2*n_Clusters))    // compute eigval ONLY for cluster 'flag-n_Clusters', eigvec for  cluster 'flag-n_Clusters' no change    {      range[0] = pointer[flag-n_Clusters];      range[1] = pointer[flag-n_Clusters+1];      A_trans_A(1, index, range, A_t_A, nz_counter);      AtA_val = new float [nz_counter];      AtA_rowind = new int [nz_counter];      AtA_colptr = new int [n_row+1];      dense_2_sparse(AtA_colptr, AtA_rowind, AtA_val, A_t_A);      for (i=0; i< n_row; i++)	delete [] A_t_A[i];      delete [] A_t_A;      power_method(AtA_val, AtA_rowind, AtA_colptr, n_row, NULL, CV[flag-n_Clusters], cluster_quality[flag-n_Clusters]);      delete [] AtA_val;      delete [] AtA_rowind;      delete [] AtA_colptr;    }  delete [] pointer;  delete [] index;  delete [] range;}float SparseMatrix::euc_dis(float *v, int i, float norm_v)  /* Given L2-norms of the vecs and v, norm[i] and norm_v,     compute the squared Euc-dis between ith vec in the matrix and v,     result is returned.     Take advantage of (x-c)^T (x-c) = x^T x - 2 x^T c + c^T c  */{  float result=0.0;  for (int j = colptrs[i]; j < colptrs[i+1]; j++)    result += vals[j] * v[rowinds[j]];  result *= -2.0;  result += norm[i]+norm_v;  return result;}void SparseMatrix::euc_dis(float *x, float norm_x, float *result)  /* Given L2-norms of the vecs and x, norm[i] and norm_x,     compute the squared Euc-dis between each vec in the matrix with x,       results are stored in array 'result'.      Take advantage of (x-c)^T (x-c) = x^T x - 2 x^T c + c^T c  */{  for (int i = 0; i < n_col; i++)    result[i] = euc_dis(x, i, norm_x);}float SparseMatrix::Kullback_leibler(float *x, int i, int laplace, float L1norm_x)  /* Given the L1_norms of vec[i] and x, (vec[i] need be normalized before function-call     compute KL divergence between vec[i] in the matrix with x,     result is returned.      Take advantage of KL(p, q) = \sum_i p_i log(p_i) - \sum_i p_i log(q_i) = norm[i] - \sum_i p_i log(q_i)     the KL is in unit of nats NOT bits.  */{  float result=0.0, row_inv_alpha=alpha/n_row;  if (priors[i] >0)    {      switch(laplace)	{	case NOLAPLACE:	  for (int j = colptrs[i]; j < colptrs[i+1]; j++)	    {	      if(x[rowinds[j]] >0.0)		result += vals[j] * log(x[rowinds[j]]);	      else		return HUGE_NUMBER; // if KL(vec[i], x) is inf. give it a huge number 1.0e20	    }     	  result = norm[i]-result+log(L1norm_x);	  break;	case CENTER_LAPLACE:	  // this vector alpha is alpha (given by user) divided by |Y|,	  //row_inv_alpha is to make computation faster	  for (int j = colptrs[i]; j < colptrs[i+1]; j++)	    result += vals[j] * log(x[rowinds[j]]+row_inv_alpha*L1norm_x) ;	  	  result = norm[i]-result+log((1+alpha)*L1norm_x);	  break;	case PRIOR_LAPLACE:	  // this vector alpha is alpha (given by user) divided by |X|*|Y|,	  //row_alpha is its L1-norm	  for (int j = colptrs[i]; j < colptrs[i+1]; j++)
12 下一页
💿 文件大小 71 K
👤 上传用户 tianlin4431
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#Clustering #euclidean #variation #splitting
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -