📄 prior.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
       * Only activated if X-PR* annotation has been used, in which       * priors are overridden and a single Dirichlet component is       * specified for each column (using structural annotation).       * If X-PR* annotation is not used, which is usually the case,        * the following code has no effect (observe how the real prior        * distributions are copied into tq, mq, iq).       */      if (hmm->tpri != NULL && hmm->tpri[k] >= 0)	{	  if (hmm->tpri[k] >= pri->tnum) Die("X-PRT annotation out of range");	  FSet(tq, pri->tnum, 0.0);	  tq[hmm->tpri[k]] = 1.0;	}      else 	FCopy(tq, pri->tq, pri->tnum);      if (hmm->mpri != NULL && hmm->mpri[k] >= 0)	{	  if (hmm->mpri[k] >= pri->mnum) Die("X-PRM annotation out of range");	  FSet(mq, pri->mnum, 0.0);	  mq[hmm->mpri[k]] = 1.0;	}      else 	FCopy(mq, pri->mq, pri->mnum);      if (hmm->ipri != NULL && hmm->ipri[k] >= 0)	{	  if (hmm->ipri[k] >= pri->inum) Die("X-PRI annotation out of range");	  FSet(iq, pri->inum, 0.0);	  iq[hmm->ipri[k]] = 1.0;	}      else 	FCopy(iq, pri->iq, pri->inum);      /* This is the main line of the code:       */      P7PriorifyTransitionVector(hmm->t[k], pri, tq);      P7PriorifyEmissionVector(hmm->mat[k], pri, pri->mnum, mq, pri->m, NULL);      P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, iq, pri->i, NULL);    }  /* We repeat the above steps just for the final match state, M.   */  if (hmm->mpri != NULL && hmm->mpri[hmm->M] >= 0)    {      if (hmm->mpri[hmm->M] >= pri->mnum) Die("X-PRM annotation out of range");      FSet(mq, pri->mnum, 0.0);      mq[hmm->mpri[hmm->M]] = 1.0;    }  else     FCopy(mq, pri->mq, pri->mnum);  P7PriorifyEmissionVector(hmm->mat[hmm->M], pri, pri->mnum, mq, pri->m, NULL);  /* Now we're done. Convert the counts-based HMM to probabilities.   */  Plan7Renormalize(hmm);}/* Function: P7PriorifyEmissionVector() *  * Purpose:  Add prior pseudocounts to an observed  *           emission count vector and renormalize.  * *           Can return the posterior mixture probabilities *           P(q | counts) if ret_mix[MAXDCHLET] is passed. *           Else, pass NULL.   *  * Args:     vec     - the 4 or 20-long vector of counts to modify *           pri     - prior data structure *           num     - pri->mnum or pri->inum; # of mixtures *           eq      - pri->mq or pri->iq; prior mixture probabilities *           e       - pri->i or pri->m; Dirichlet components           *           ret_mix - filled with posterior mixture probabilities, or NULL *                    * Return:   (void) *           The counts in vec are changed and normalized to probabilities. */                  voidP7PriorifyEmissionVector(float *vec, struct p7prior_s *pri, 		       int num, float eq[MAXDCHLET], float e[MAXDCHLET][MAXABET],		       float *ret_mix){  int   x;                      /* counter over vec                     */  int   q;                      /* counter over mixtures                */  float mix[MAXDCHLET];         /* posterior distribution over mixtures */  float totc;                   /* total counts                         */  float tota;                   /* total alpha terms                    */  float xi;                     /* X_i term, Sjolander eq. 41           */  /* Calculate mix[], which is the posterior probability   * P(q | n) of mixture component q given the count vector n   *   * (side effect note: note that an insert vector in a PAM prior   * is passed with num = 1, bypassing pam prior code; this means   * that inserts cannot be mixture Dirichlets...)   * [SRE, 12/24/00: the above comment is cryptic! what the hell does that   *  mean, inserts can't be mixtures? doesn't seem to be true. it    *  may mean that in a PAM prior, you can't have a mixture for inserts,   *  but I don't even understand that. The insert vectors aren't passed   *  with num=1!!]   */  mix[0] = 1.0;  if (pri->strategy == PRI_DCHLET && num > 1)     {      for (q = 0; q < num; q++) 	{	  mix[q] =  eq[q] > 0.0 ? log(eq[q]) : -999.;	  mix[q] += Logp_cvec(vec, Alphabet_size, e[q]);	}      LogNorm(mix, num);      /* now mix[q] is P(component_q | n) */    }  else if (pri->strategy == PRI_PAM && num > 1)     {		/* pam prior uses aa frequencies as `P(q|n)' */      for (q = 0; q < Alphabet_size; q++) 	mix[q] = vec[q];      FNorm(mix, Alphabet_size);    }  /* Convert the counts to probabilities, following Sjolander (1996)    */  totc = FSum(vec, Alphabet_size);  for (x = 0; x < Alphabet_size; x++) {    xi = 0.0;    for (q = 0; q < num; q++) {      tota = FSum(e[q], Alphabet_size);      xi += mix[q] * (vec[x] + e[q][x]) / (totc + tota);    }    vec[x] = xi;  }  FNorm(vec, Alphabet_size);  if (ret_mix != NULL)    for (q = 0; q < num; q++)      ret_mix[q] = mix[q];}/* Function: P7PriorifyTransitionVector() *  * Purpose:  Add prior pseudocounts to transition vector, *           which contains three different probability vectors *           for m, d, and i.  *            * Args:     t     - state transitions, counts: 3 for M, 2 for I, 2 for D.    *           prior - Dirichlet prior information *           tq    - prior distribution over Dirichlet components. *                   (overrides prior->iq[]; used for alternative *                   methods of conditioning prior on structural data)   *            * Return:   (void) *           t is changed, and renormalized -- comes back as *           probability vectors. */          voidP7PriorifyTransitionVector(float *t, struct p7prior_s *prior, 			   float tq[MAXDCHLET]){  int   ts;  int   q;  float mix[MAXDCHLET];  float totm, totd, toti;       /* total counts in three transition vecs */  float xi;                     /* Sjolander's X_i term */  mix[0] = 1.0;			/* default is simple one component */  if ((prior->strategy == PRI_DCHLET || prior->strategy == PRI_PAM) && prior->mnum > 1)    {      for (q = 0; q < prior->tnum; q++)        {          mix[q] =  tq[q] > 0.0 ? log(tq[q]) : -999.;          mix[q] += Logp_cvec(t,   3, prior->t[q]);   /* 3 match  */          mix[q] += Logp_cvec(t+3, 2, prior->t[q]+3); /* 2 insert */	  mix[q] += Logp_cvec(t+5, 2, prior->t[q]+5); /* 2 delete */        }      LogNorm(mix, prior->tnum); /* mix[q] is now P(q | counts) */    }				/* precalc some denominators */  totm = FSum(t,3);		  toti = t[TIM] + t[TII];  totd = t[TDM] + t[TDD];  for (ts = 0; ts < 7; ts++)      {      xi = 0.0;      for (q = 0; q < prior->tnum; q++)        {	  switch (ts) {	  case TMM: case TMI: case TMD: 	    xi += mix[q] * (t[ts] + prior->t[q][ts]) / 	      (totm + FSum(prior->t[q], 3)); 	    break;	  case TIM: case TII: 	    xi += mix[q] * (t[ts] + prior->t[q][ts]) / 	      (toti + prior->t[q][TIM] + prior->t[q][TII]);	    break;	  case TDM: case TDD: 	    xi += mix[q] * (t[ts] + prior->t[q][ts]) / 	      (totd + prior->t[q][TDM] + prior->t[q][TDD]);	    break;	  }        }      t[ts] = xi;    }  FNorm(t,   3);		/* match  */  FNorm(t+3, 2);		/* insert */  FNorm(t+5, 2);		/* delete */}/* Function: default_amino_prior() *  * Purpose:  Set the default protein prior. */static struct p7prior_s *default_amino_prior(void){  struct p7prior_s *pri;  int q, x;				/* default match mixture coefficients */  static float defmq[9] = {    0.178091, 0.056591, 0.0960191, 0.0781233, 0.0834977,     0.0904123, 0.114468, 0.0682132, 0.234585 };				/* default match mixture Dirichlet components */  static float defm[9][20] = {    { 0.270671, 0.039848, 0.017576, 0.016415, 0.014268,       0.131916, 0.012391, 0.022599, 0.020358, 0.030727,       0.015315, 0.048298, 0.053803, 0.020662, 0.023612,      0.216147, 0.147226, 0.065438, 0.003758, 0.009621 },    { 0.021465, 0.010300, 0.011741, 0.010883, 0.385651,       0.016416, 0.076196, 0.035329, 0.013921, 0.093517,       0.022034, 0.028593, 0.013086, 0.023011, 0.018866,       0.029156, 0.018153, 0.036100, 0.071770, 0.419641 },    { 0.561459, 0.045448, 0.438366, 0.764167, 0.087364,      0.259114, 0.214940, 0.145928, 0.762204, 0.247320,      0.118662, 0.441564, 0.174822, 0.530840, 0.465529,       0.583402, 0.445586, 0.227050, 0.029510, 0.121090 },    { 0.070143, 0.011140, 0.019479, 0.094657, 0.013162,       0.048038, 0.077000, 0.032939, 0.576639, 0.072293,       0.028240, 0.080372, 0.037661, 0.185037, 0.506783,       0.073732, 0.071587, 0.042532, 0.011254, 0.028723 },    { 0.041103, 0.014794, 0.005610, 0.010216, 0.153602,       0.007797, 0.007175, 0.299635, 0.010849, 0.999446,       0.210189, 0.006127, 0.013021, 0.019798, 0.014509,       0.012049, 0.035799, 0.180085, 0.012744, 0.026466 },    { 0.115607, 0.037381, 0.012414, 0.018179, 0.051778,       0.017255, 0.004911, 0.796882, 0.017074, 0.285858,       0.075811, 0.014548, 0.015092, 0.011382, 0.012696,       0.027535, 0.088333, 0.944340, 0.004373, 0.016741 },    { 0.093461, 0.004737, 0.387252, 0.347841, 0.010822,       0.105877, 0.049776, 0.014963, 0.094276, 0.027761,       0.010040, 0.187869, 0.050018, 0.110039, 0.038668,       0.119471, 0.065802, 0.025430, 0.003215, 0.018742 },    { 0.452171, 0.114613, 0.062460, 0.115702, 0.284246,      0.140204, 0.100358, 0.550230, 0.143995, 0.700649,       0.276580, 0.118569, 0.097470, 0.126673, 0.143634,       0.278983, 0.358482, 0.661750, 0.061533, 0.199373 },    { 0.005193, 0.004039, 0.006722, 0.006121, 0.003468,       0.016931, 0.003647, 0.002184, 0.005019, 0.005990,       0.001473, 0.004158, 0.009055, 0.003630, 0.006583,       0.003172, 0.003690, 0.002967, 0.002772, 0.002686 },  };  pri = P7AllocPrior();  pri->strategy = PRI_DCHLET;  /* Transition priors are subjective, but borrowed from GJM's estimations   * on Pfam   */  pri->tnum     = 1;  pri->tq[0]    = 1.0;  pri->t[0][TMM]   = 0.7939;  pri->t[0][TMI]   = 0.0278;  pri->t[0][TMD]   = 0.0135;  pri->t[0][TIM]   = 0.1551;  pri->t[0][TII]   = 0.1331;  pri->t[0][TDM]   = 0.9002;  pri->t[0][TDD]   = 0.5630;    /* Match emission priors are a mixture Dirichlet,   * from Kimmen Sjolander (Blocks9)   */  pri->mnum  = 9;  for (q = 0; q < pri->mnum; q++)     {      pri->mq[q] = defmq[q];      for (x = 0; x < 20; x++)	pri->m[q][x] = defm[q][x];    }  /* These insert emission priors are subjective. Observed frequencies   * were obtained from PFAM 1.0, 10 Nov 96;    *      see ~/projects/plan7/InsertStatistics.   * Inserts are slightly biased towards polar residues and away from   * hydrophobic residues.   */  pri->inum  = 1;  pri->iq[0] = 1.;  pri->i[0][0]  = 681.;         /* A */  pri->i[0][1]  = 120.;         /* C */  pri->i[0][2]  = 623.;         /* D */  pri->i[0][3]  = 651.;         /* E */  pri->i[0][4]  = 313.;         /* F */  pri->i[0][5]  = 902.;         /* G */  pri->i[0][6]  = 241.;         /* H */  pri->i[0][7]  = 371.;         /* I */  pri->i[0][8]  = 687.;         /* K */  pri->i[0][9]  = 676.;         /* L */  pri->i[0][10] = 143.;         /* M */  pri->i[0][11] = 548.;         /* N */  pri->i[0][12] = 647.;         /* P */  pri->i[0][13] = 415.;         /* Q */  pri->i[0][14] = 551.;         /* R */  pri->i[0][15] = 926.;         /* S */  pri->i[0][16] = 623.;         /* T */  pri->i[0][17] = 505.;         /* V */  pri->i[0][18] = 102.;         /* W */  pri->i[0][19] = 269.;         /* Y */  return pri;}/* Function: default_nucleic_prior() *  * Purpose:  Set the default DNA prior. (for now, almost a Laplace) */static struct p7prior_s *default_nucleic_prior(void){  struct p7prior_s *pri;  pri = P7AllocPrior();  pri->strategy = PRI_DCHLET;  /* The use of the Pfam-trained amino acid transition priors   * here is TOTALLY bogus. But it works better than a straight   * Laplace, esp. for Maxmodelmaker(). For example, a Laplace    * prior builds M=1 models for a single sequence GAATTC (at   * one time an open "bug").   */  pri->tnum        = 1;  pri->tq[0]       = 1.;  pri->t[0][TMM]   = 0.7939;  pri->t[0][TMI]   = 0.0278;  pri->t[0][TMD]   = 0.0135;  pri->t[0][TIM]   = 0.1551;  pri->t[0][TII]   = 0.1331;  pri->t[0][TDM]   = 0.9002;  pri->t[0][TDD]   = 0.5630;    pri->mnum  = 1;  pri->mq[0] = 1.;  FSet(pri->m[0], Alphabet_size, 1.);  pri->inum  = 1;  pri->iq[0] = 1.;  FSet(pri->i[0], Alphabet_size, 1.);  return pri;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -