📄 cluster.c

📁 聚类分析的源码集
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
    for (ii = 0; ii < nobjects; ii++)    /* Calculate the distances */    { int i = order[ii];      int jnow = clusterid[i];      if (cn[jnow]>1)      { /* No reassignment if that would lead to an empty cluster */        /* Treat the present cluster as a special case */        double distance =          metric(ndata,data,cdata,mask,cmask,weight,i,jnow,transpose);        int j;        for (j = 0; j < jnow; j++)        { double tdistance =            metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);          if (tdistance < distance)          { distance = tdistance;            cn[clusterid[i]]--;            clusterid[i] = j;            cn[j]++;            changed = 1;          }        }        for (j = jnow+1; j < nclusters; j++)        { double tdistance =            metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);          if (tdistance < distance)          { distance = tdistance;            cn[clusterid[i]]--;            clusterid[i] = j;            cn[j]++;            changed = 1;          }        }      }    }    /* compare to the saved clustering solution */    same = 1;    for (ii = 0; ii < nobjects; ii++)    { if (savedids[ii] != clusterid[ii])      { same = 0;        break;   /* No point in checking the other ids */      }    }  } while (changed && !same);  free (savedids);  free (order);  free (cn);  return;}/* *********************************************************************** */void CALL kcluster (int nclusters, int nrows, int ncolumns,  double** data, int** mask, double weight[], int transpose,  int npass, char method, char dist,  int clusterid[], double** cdata, double* error, int* ifound)/*Purpose=======The kcluster routine performs k-means or k-median clustering on a given set ofelements, using the specified distance measure. The number of clusters is givenby the user. Multiple passes are being made to find the optimal clusteringsolution, each time starting from a different initial clustering.Arguments=========nclusters  (input) intThe number of clusters to be found.data       (input) double array, dimension( nrows,ncolumns )The array containing the data of the elements to be clustered (i.e., the geneexpression data).mask       (input) int array, dimension( nrows,ncolumns )This array shows which data values are missing. Ifmask[i][j] == 0, then data[i][j] is missing.nrows     (input) intThe number of rows in the data matrix, equal to the number of genes.ncolumns  (input) intThe number of columns in the data matrix, equal to the number of microarrays.weight (input) double array, dimension( n )The weights that are used to calculate the distance.transpose  (input) intIf transpose==0, the rows of the matrix are clustered. Otherwise, columnsof the matrix are clustered.npass      (input) intThe number of times clustering is performed. Clustering is performed npasstimes, each time starting from a different (random) initial assignment of genes to clusters. The clustering solution with the lowest within-cluster sumof distances is chosen.If npass==0, then the clustering algorithm will be run once, where the initialassignment of elements to clusters is taken from the clusterid array.method     (input) charDefines whether the arithmetic mean (method=='a') or the median(method=='m') is used to calculate the cluster center.dist       (input) charDefines which distance measure is used, as given by the table:dist=='e': Euclidean distancedist=='h': Harmonically summed Euclidean distancedist=='b': City-block distancedist=='c': correlationdist=='a': absolute value of the correlationdist=='u': uncentered correlationdist=='x': absolute uncentered correlationdist=='s': Spearman's rank correlationdist=='k': Kendall's tauFor other values of dist, the default (Euclidean distance) is used.clusterid  (output; input) int array, dimension( nrows or ncolumns )The cluster number to which a gene or microarray was assigned. If npass==0,then on input clusterid contains the initial clustering assignment from whichthe clustering algorithm starts. On output. it contains the clustering solutionthat was found.cdata      (output) double array, dimension( nclusters,ncolumns ) (transpose==0)                               or dimension( nrows, nclusters) (transpose==1)This array contains the center of each cluster, as definedas the average of the elements for each cluster (ifmethod=='a') or as the median (if method=='m').error      (output) doubleThe sum of distances to the cluster center of each item in the optimal k-meansclustering solution that was found.ifound     (output) intThe number of times the optimal clustering solution wasfound. The value of ifound is at least 1; its maximum value is npass.========================================================================*/{ const int nobjects = (transpose==0) ? nrows : ncolumns;  const int ndata = (transpose==0) ? ncolumns : nrows;  void (*getclustercenter)    (int,int,int,double**,int**,int[],double**,int**,int);  double (*metric)    (int,double**,double**,int**,int**,const double[],int,int,int);  const int init_given = (npass==0) ? 1 : 0;  int i;  int** cmask;  int** tcmask;  double** tcdata;  int ipass;  int* tclusterid;  int* mapping;  int* savedinitialid = NULL;  if (nobjects < nclusters)  { *ifound = 0;    return;  }  /* More clusters asked for than objects available */  /* First initialize the random number generator */  initran();  /* Set the function to find the centroid as indicated by method */  if (method == 'm') getclustercenter = &getclustermedian;  else getclustercenter = &getclustermean;  /* Set the metric function as indicated by dist */  setmetric (dist, &metric);  /* Set the result of the first pass as the initial best clustering solution */  *ifound = 1;  /* Find out if the user specified an initial clustering */  if (init_given)  /* Save the initial clustering specified by the user */  { savedinitialid = malloc(nobjects*sizeof(int));    for (i = 0; i < nobjects; i++) savedinitialid[i] = clusterid[i];  }  if (transpose==0)  { cmask = malloc(nclusters*sizeof(int*));    for (i = 0; i < nclusters; i++) cmask[i] = malloc(ndata*sizeof(int));  }  else  { cmask = malloc(ndata*sizeof(int*));    for (i = 0; i < ndata; i++) cmask[i] = malloc(nclusters*sizeof(int));  }  *error = 0.;  emalg(nclusters, nrows, ncolumns, data, mask, weight, transpose, init_given,    getclustercenter, metric, clusterid, cdata, cmask);  for (i = 0; i < nobjects; i++)  { int j = clusterid[i];    *error += metric(ndata, data, cdata, mask, cmask, weight, i, j, transpose);  }  if (transpose==0)    for (i = 0; i < nclusters; i++) free(cmask[i]);  else    for (i = 0; i < ndata; i++) free(cmask[i]);  free(cmask);  if (npass==0) return;  /* Create temporary space for cluster centroid information */  if (transpose==0)  { tcmask = malloc(nclusters*sizeof(int*));    tcdata = malloc(nclusters*sizeof(double*));    for (i = 0; i < nclusters; i++)    { tcmask[i] = malloc(ndata*sizeof(int));      tcdata[i] = malloc(ndata*sizeof(double));    }  }  else  { tcmask = malloc(ndata*sizeof(int*));    tcdata = malloc(ndata*sizeof(double*));    for (i = 0; i < ndata; i++)    { tcmask[i] = malloc(nclusters*sizeof(int));      tcdata[i] = malloc(nclusters*sizeof(double));    }  }  tclusterid = malloc(nobjects*sizeof(int));  mapping = malloc(nclusters*sizeof(int));  for (ipass = 1; ipass < npass; ipass++)  { double tssin = 0.;    int same = 1;    if (init_given)      for (i = 0; i < nobjects; i++) tclusterid[i] = savedinitialid[i];    emalg(nclusters, nrows, ncolumns, data, mask, weight, transpose, init_given,      getclustercenter, metric, tclusterid, tcdata, tcmask);    for (i = 0; i < nclusters; i++) mapping[i] = -1;    for (i = 0; i < nobjects; i++)    { int j = tclusterid[i];      if (mapping[j] == -1) mapping[j] = clusterid[i];      else if (mapping[j] != clusterid[i]) same = 0;      tssin +=        metric(ndata, data, tcdata, mask, tcmask, weight, i, j, transpose);    }    if (same) (*ifound)++;    else if (tssin < *error)    { int j;      *ifound = 1;      *error = tssin;      for (i = 0; i < nobjects; i++) clusterid[i] = tclusterid[i];      if (transpose==0)      { for (i = 0; i < nclusters; i++)          for (j = 0; j < ndata; j++)            cdata[i][j] = tcdata[i][j];      }      else      { for (i = 0; i < ndata; i++)        { for (j = 0; j < nclusters; j++)            cdata[i][j] = tcdata[i][j];        }      }    }  }  /* Deallocate temporarily used space */  free(mapping);  free(tclusterid);  if (savedinitialid) free(savedinitialid);  if (transpose==0)  { for (i = 0; i < nclusters; i++)    { free(tcmask[i]);      free(tcdata[i]);    }  }  else  { for (i = 0; i < ndata; i++)    { free(tcmask[i]);      free(tcdata[i]);    }  }  free(tcmask);  free(tcdata);  return;}/* *********************************************************************** */void CALL kmedoids (int nclusters, int nelements, double** distance,  int npass, int clusterid[], double* error, int* ifound)/*Purpose=======The kmedoids routine performs k-medoids clustering on a given set of elements,using the distance matrix and the number of clusters passed by the user.Multiple passes are being made to find the optimal clustering solution, eachtime starting from a different initial clustering.Arguments=========nclusters  (input) intThe number of clusters to be found.nelements  (input) intThe number of elements to be clustered.distmatrix (input) double array, ragged  (number of rows is nelements, number of columns is equal to the row number)The distance matrix. To save space, the distance matrix is given in theform of a ragged array. The distance matrix is symmetric and has zeroson the diagonal. See distancematrix for a description of the content.npass      (input) intThe number of times clustering is performed. Clustering is performed npasstimes, each time starting from a different (random) initial assignment of genesto clusters. The clustering solution with the lowest within-cluster sum ofdistances is chosen.If npass==0, then the clustering algorithm will be run once, where the initialassignment of elements to clusters is taken from the clusterid array.clusterid  (output; input) int array, dimension( nelements )On input, if npass==0, then clusterid contains the initial clustering assignmentfrom which the clustering algorithm starts; all numbers in clusterid should bebetween zero and nelements-1 inclusive. If npass!=0, clusterid is ignored oninput.On output, clusterid contains the clustering solution that was found: clusteridcontains the number of the cluster to which each item was assigned. On output,the number of a cluster is defined as the item number of the centroid of thecluster.error      (output) doubleThe sum of distances to the cluster center of each item in the optimal k-medoidsclustering solution that was found.ifound     (output) intThe number of times the optimal clustering solution wasfound. The value of ifound is at least 1; its maximum value is npass.========================================================================*/{ int i, j, k, icluster, ipass;  int* tclusterid;  int* centroids;  int* savedids;  double* errors;  int same, changed;  int iteration = 0;  int period = 10;  /* needed to check for periodic behavior */  if (nelements < nclusters)  { *ifound = 0;    return;  } /* More clusters asked for than elements available */  centroids = malloc(nclusters*sizeof(int));  savedids = malloc(nelements*sizeof(int));  errors = malloc(nclusters*sizeof(double));  /* Set the result of the first pass as the initial best clustering solution */  *ifound = 1;  /* Find out if the user specified an initial clustering */  if (npass)  { initran(); /* First initialize the random number generator */    randomassign (nclusters, nelements, clusterid);    /* Ready for the first run */  }  *error = 0.;  do /* Start the loop */  { if (iteration % period == 0)    { /* save the current clustering solution */      for (i = 0; i < nelements; i++) savedids[i] = clusterid[i];      period *= 2;    }    iteration++;    /* Find the center */    getclustermedoid (nclusters, nelements, distance, clusterid, centroids, errors);    changed = 0;    for (i = 0; i < nelements; i++)    /* Find the closest cluster */    { double d = DBL_MAX;      for (icluster = 0; icluster < nclusters; icluster++)      { double td;        j = centroids[icluster];        if (i==j)        { d = 0.0;          clusterid[i] = icluster;          changed = 1;          break;        }        td = (i > j) ? distance[i][j] : distance[j][i];        if (td < d)        { d = td;          clusterid[i] = icluster;          changed = 1;        }      }    }    /* compare to the saved clustering solution (periodicity check) */    same = 1;    for (i = 0; i < nelements; i++)    { if (savedids[i] != clusterid[i])      { same = 0;        break;   /* No point in checking the other ids */      }    }  } while (changed && !same);  for (i = 0; i < nelements; i++)  { const int j = centroids[clusterid[i]];    /* Set the cluster number to the item number of the cluster centroid */    clusterid[i] = j;    if (i==j) continue;    *error += (i > j) ? distance[i][j] : distance[j][i];  }  if (npass==0)  /* Deterministic result depending on the specified initial clustering */  { free(savedids);    free(centroids);    free(errors);    return; /* Done for today */  }  tclusterid = malloc(nelements*sizeof(int));  for (ipass = 1; ipass < npass; ipass++)  { double terror = 0.0;
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -