⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 camjul97.c

📁 这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer
💻 C
📖 第 1 页 / 共 2 页
字号:
  *ret_nmx    = nmx;  return;}/* Function: OldPrintGJMMatrix() *  * Purpose:  (debugging, basically): print out Graeme's *           joint probability matrices in log odds integer form. *            */voidOldPrintGJMMatrix(FILE *fp, float **jmx, float *rnd, int N) {  int r, c;  fprintf(fp, "  ");  for (c = 0; c < N; c++)    fprintf(fp, " %c  ", Alphabet[c]);  fprintf(fp, "\n");    for (r = 0; r < N; r++)    {      fprintf(fp, "%c ", Alphabet[r]);      for (c = 0; c < N; c++) 	fprintf(fp, "%3d ", 		(int) (10. * sreLOG2(jmx[r][c] / (rnd[r] * rnd[c]))));      fprintf(fp, "\n");    }}#endif /* SRE_REMOVED*//* Function: Joint2SubstitutionMatrix() *  * Purpose:  Convert a joint probability matrix to a substitution *           matrix. *            *           Convention here for substitution matrices is *           smx[r][c] = r->c = P(c|r). *            *           We obtain the substitution matrix from the following logic: *           P(rc) = P(c|r) P(r);     *           P(r)  = \sum_c P(rc); *           thus P(c|r) = P(rc) / \sum_c P(rc) *  * Args:     jmx - NxN P(rc) joint probability matrix  *           smx - NxN P(c|r) substitution matrix, alloced in caller *           N   - size of matrices; typically Alphabet_size *            * Return:   (void) *           smx is filled in.           */voidJoint2SubstitutionMatrix(float **jmx, float **smx, int N){  float pr;			/* P(r) = \sum_c P(rc)        */  int   r,c;			/* counters for rows, columns */    for (r = 0; r < N; r++)    {      for (pr = 0., c = 0; c < N; c++)	pr += jmx[r][c];      for (c = 0; c < N; c++)	smx[r][c] = jmx[r][c] / pr;    }}#ifdef SRE_REMOVED/* Function: BlosumWeights() *  * Purpose:  Assign weights to a set of aligned sequences *           using the BLOSUM rule: *             - do single linkage clustering at some pairwise identity *             - in each cluster, give each sequence 1/clustsize *               total weight. *                * Args:     aseqs - alignment *           N     - number of seqs in alignment *           maxid - fractional identity (e.g. 0.62 for BLOSUM62) *           clust - [0..nseq-1] vector of cluster assignments, filled here (or NULL) *           ret_nc - total number of clusters found  (or pass NULL) */               voidBlosumWeights(char **aseqs, AINFO *ainfo, float maxid, int *clust,int *ret_nc){  float         **dmx;          /* difference matrix */  struct phylo_s *tree;         /* UPGMA tree        */  float           mindiff;	/* minimum distance between clusters */  int             c;		/* counter for clusters */  struct intstack_s *stack;  int             node;  int             i;  mindiff = 1.0 - maxid;				/* first we do a difference matrix */  MakeDiffMx(aseqs, ainfo->nseq, &dmx);				/* then we build a tree */  Cluster(dmx, ainfo->nseq, CLUSTER_MIN, &tree);  /* Find clusters below mindiff.   * The rule is:    *     -traverse the tree   *     -if the parent is > mindiff and current < mindiff, then   *      make current node a cluster.   */  for (i = 0; i < ainfo->nseq; i++)    {      ainfo->sqinfo[i].weight = 1.0;      ainfo->sqinfo[i].flags |= SQINFO_WGT;    }  stack = InitIntStack();  PushIntStack(stack, 0);	/* push root on stack to start */  c = 0;  while (PopIntStack(stack, &node))    {      if ((node == 0 || tree[tree[node].parent-ainfo->nseq].diff > mindiff) &&	  tree[node].diff < mindiff)	{			/* we're at a cluster */	  for (i = 0; i < ainfo->nseq;  i++)	    if (tree[node].is_in[i]) 	      {		ainfo->sqinfo[i].weight = 1.0 / (float) tree[node].incnum;		if (clust != NULL) clust[i] = c;	      }	  c++;	}      else			/* we're not a cluster, keep traversing */	{	  if (tree[node].right >= ainfo->nseq)	    PushIntStack(stack, tree[node].right - ainfo->nseq);	  else 	    {	      c++;	      if (clust != NULL) clust[tree[node].right] = c; /* single seq, wgt 1.0 */	    }	  if (tree[node].left >= ainfo->nseq)	    PushIntStack(stack, tree[node].left - ainfo->nseq);	  else	    {	      c++;	      if (clust != NULL) clust[tree[node].left] = c;	    }	}    }  FreeIntStack(stack);  FreePhylo(tree, ainfo->nseq);  FMX2Free(dmx);  if (ret_nc != NULL) *ret_nc = c;  return;}#endif#ifdef SRE_REMOVED/* Function: calc_probq() *  * Purpose:  Calculate the posterior probability distribution *           P(q | a_j) for every column j in the alignment *           and every matrix choice q. *            *           Probabilistic, based on a star topology. *           Uses a BLOSUM-like rule to cluster the sequences in *           the alignment into groups with some seq identity (62%). *           Finds the consensus (majority rule) residue in *           each cluster as the representative. *           Then P(q | col) comes by Bayes: *                 = (P(col | q) P(q) / Z *           where the likelihood *                 P(col | q)  = \sum_b [\prod_i P(a_i | q,b)] P(b | q)  *             log P(col | q) = \logsum_b P(b|q) + \sum_i \log(P(a_i | q,b)) *            * Args:     aseqs - alignment *           ainfo - optional info for alignment *           mx    - conditional probability matrices [0..nmx-1][root b][x] *           bprior- root priors [0..nmx-1][root b]  *           qprior- prior prob distribution over matrices  *           nmx   - number of matrices *           probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1] *                   alloc'ed in called, filled in here. *                    * Return:   (void) *           probq is filled in.                    */static void calc_probq(char **aseqs, AINFO *ainfo, float ***mx, float **bprior, 	   float *qprior, int nmx, float **probq){  int q;			/* counter over matrices  */  int a1;			/* counter over sequences */  int j;			/* counter over columns   */  int  *clust;                  /* assignment of seqs to clusters 0..nseq-1 */  int   nclust;			/* number of clusters */  float *wgt;                   /* weights on seqs, 0..nseq-1 */  int   *sym;                   /* symbol indices in a column */  float  obs[MAXABET];          /* number of symbols observed in a column */  int    i, x;  float  maxc;  float  ngap;  float  bterm[20];		/* intermediate in calculation, over root b's */  int    b;			/* counter over root symbols */  /* Use the BLOSUM rule to calculate weights and clusters   * for sequences in the alignment   */  wgt   = (float *) MallocOrDie (sizeof(float) * ainfo->nseq);  clust = (int *) MallocOrDie (sizeof(int)   * ainfo->nseq);  BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust);  /* Use the BLOSUM rule to calculate a "likelihood" function   * P(column | q) for each column.   */   sym = (int *) MallocOrDie (sizeof(int) * nclust);  for (j = 0; j < ainfo->alen; j++)    {				/* Find majority rule symbols in this col  */      for (i = 0; i < nclust; i++)	{	  FSet(obs, Alphabet_size, 0.);	  ngap = 0.;	  for (a1 = 0; a1 < ainfo->nseq; a1++)	    if (clust[a1] == i)	      if   (isgap(aseqs[a1][j])) ngap += 0.;	      else P7CountSymbol(obs, SymbolIndex(aseqs[a1][j]), 1.0);	  maxc = -1.;	  for (x = 0; x < Alphabet_size; x++)	    if (obs[x] > maxc) { maxc = obs[x]; sym[i] = x; }		/* either if no symbols observed, or more gaps than syms: */	  if (ngap >= maxc) sym[i] = -1; 	}				/* Calculate log likelihood + log prior */      for (q = 0; q < nmx; q++)	{	  for (b = 0; b < 20; b++)	    {	      bterm[b] = bprior[q][b];	      for (i = 0; i < nclust; i++)		if (sym[i] >= 0)		  bterm[b] += log(mx[q][b][sym[i]]);	    }	  probq[j][q] = log(qprior[q]) + FLogSum(bterm, 20);	}      LogNorm(probq[j], nmx);	/* normalize -> gives posterior. */    }  free(sym);  free(wgt);  free(clust);}/* Function: old_calc_probq() OBSOLETE VERSION *  * Purpose:  Calculate the posterior probability distribution *           P(q | a_j) for every column j in the alignment *           and every matrix choice q. *            *           Non-probabilistic. Uses a BLOSUM-like rule to *           find the single best matrix for a column, then *           assigns it a posterior of 1.0. * *           This was version 1: a competitive learning rule, *           posterior either 1.0 or 0.0.  *            * Args:     aseqs - alignment *           ainfo - optional info for alignment *           jmx   - *joint* probability matrices [0..nmx-1][0..19][0..19] *           qprior- prior prob distribution over matrices [UNUSED]   *           nmx   - number of matrices *           probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1] *                   alloc'ed in called, filled in here. *                    * Return:   (void) *           probq is filled in.                    */static void old_calc_probq(char **aseqs, AINFO *ainfo, float ***jmx, float *qprior, 	   int nmx, float **probq){  int q;			/* counter over matrices  */  int a1, a2;			/* counters over sequences */  int j;			/* counter over columns   */  float x;			/* BLOSUM-style objective function */  float maxx;			/* maximum x so far */  int   maxq;			/* maximum q so far */  int  *clust;                  /* assignment of seqs to clusters 0..nseq-1 */  int   nclust;			/* number of clusters */  float *wgt;                   /* weights on seqs, 0..nseq-1 */  int   *sym;                   /* symbol indices in a column */    /* Use the BLOSUM rule to calculate weights and clusters   * for sequences in the alignment   */  wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq);  clust = (int *) MallocOrDie (sizeof(int)   * ainfo->nseq);  BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust);  /* Use the BLOSUM rule to calculate a "likelihood" function   * P(column | q) for each column.   */   sym = (int *) MallocOrDie (sizeof(int) * ainfo->nseq);  for (j = 0; j < ainfo->alen; j++)    {      for (a1 = 0; a1 < ainfo->nseq; a1++)	if (!isgap(aseqs[a1][j]) &&	    strchr(Alphabet, aseqs[a1][j]) != NULL)	  {	    sym[a1] = SYMIDX(aseqs[a1][j]);	    if (sym[a1] >= Alphabet_size) sym[a1] = -1; /* no degenerates */	  }	else sym[a1] = -1;      maxx = -FLT_MAX;      for (q = 0; q < nmx; q++)	{	  x = 0.;	  for (a1 = 0; a1 < ainfo->nseq; a1++)	    for (a2 = 0; a2 < ainfo->nseq; a2++)	      if (sym[a1] >= 0 && sym[a2] >= 0 && clust[a1] != clust[a2])		x += wgt[a1] * wgt[a2] * log(jmx[q][sym[a1]][sym[a2]]);#ifdef SRE_REMOVED	  printf("%% col %3d mx %c x = %f\n", 		 j+1, 'a'+(char)q, x);    #endif	  if (x > maxx) 	    {	      maxx = x;	      maxq = q;	    }	}      FSet(probq[j], nmx, 0.0);      probq[j][maxq] = 1.0;	/* winner-take-all rule */    }	  free(sym);  free(wgt);  free(clust);}/* Function: print_probq() *  * Purpose:  Debugging output. *           probq is the posterior probability P(q | column) of *           a matrix q given an observed alignment column. *           Indexed probq[0..alen-1][0..nmx-1]. */static voidprint_probq(FILE *fp, float **probq, int alen, int nmx){  int c;			/* counter for columns  */  int q;			/* counter for matrices */  fputs("### probq debugging output\n", fp);  fputs("     ", fp);  for (q = 0; q < nmx; q++)    fprintf(fp, "  %c   ", 'a'+(char)q);  fputs("\n", fp);  for (c = 0; c < alen; c++)    {      fprintf(fp, "%4d ", c);      for (q = 0; q < nmx; q++)	fprintf(fp, "%5.3f ", probq[c][q]);      fputs("\n", fp);    }}#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -