📄 hmmbuild.c
  for (k = 1; k < hmm->M; k++)
    {
      fputs("I ", cfp);
      for (x = 0; x < Alphabet_size; x++)
        fprintf(cfp, "%8.2f ", hmm->ins[k][x]);
      fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
      if ((hmm->flags & PLAN7_CS) && (hmm->flags & PLAN7_CA))
        fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]);
      else
        fputs("- -", cfp);
      fputs("\n", cfp);
    }

  /* transition vectors */
  for (k = 1; k < hmm->M; k++)
    {
      fputs("T ", cfp);
      for (x = 0; x < 7; x++)
        fprintf(cfp, "%8.2f ", hmm->t[k][x]);
      fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
      if ((hmm->flags & PLAN7_CS) && (hmm->flags & PLAN7_CA))
        fprintf(cfp, "%c %c %c %c",
                hmm->cs[k], hmm->cs[k+1], hmm->ca[k], hmm->ca[k+1]);
      else
        fputs("- -", cfp);
      fputs("\n", cfp);
    }
}

/* Function: position_average_score()
 * Date:     Wed Dec 31 09:36:35 1997 [StL]
 *
 * Purpose:  Calculate scores from tracebacks, keeping them
 *           in a position specific array. The final array
 *           is normalized position-specifically too, according
 *           to how many sequences contributed data to this
 *           position. Used for compensating for sequence
 *           fragments in ME and MD score optimization.
 *           Very much ad hoc.
 *
 *           Code related to (derived from) TraceScore().
 *
 * Args:     hmm      - HMM structure, scores valid
 *           dsq      - digitized unaligned sequences
 *           wgt      - weights on the sequences
 *           nseq     - number of sequences
 *           tr       - array of nseq tracebacks that aligns each dsq to hmm
 *           pernode  - RETURN: [0]1..M array of position-specific avg scores
 *           ret_avg  - RETURN: overall average full-length, one-domain score
 *
 * Return:   (void)
 *           pernode is malloc'ed [0]1..M by CALLER and filled here.
 */
static void
position_average_score(struct plan7_s    *hmm,
                       char             **dsq,
                       float             *wgt,
                       int                nseq,
                       struct p7trace_s **tr,
                       float             *pernode,
                       float             *ret_avg)
{
  int    pos;                   /* position in seq                   */
  int    sym;                   /* digitized symbol at that position */
  int    tpos;                  /* position in trace/state sequence  */
  float *counts;                /* counts at each position           */
  float  avg;                   /* RETURN: average overall           */
  int    k;                     /* counter for model position        */
  int    idx;                   /* counter for sequence number       */

  /* Allocations */
  counts = MallocOrDie ((hmm->M+1) * sizeof(float));
  FSet(pernode, hmm->M+1, 0.);
  FSet(counts,  hmm->M+1, 0.);

  /* Loop over traces, accumulate weighted scores per position */
  for (idx = 0; idx < nseq; idx++)
    for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
      {
        pos = tr[idx]->pos[tpos];
        sym = (int) dsq[idx][tr[idx]->pos[tpos]];
        k   = tr[idx]->nodeidx[tpos];

        /* Counts: how many times did we use this model position 1..M?
         * (weighted)
         */
        if (tr[idx]->statetype[tpos] == STM || tr[idx]->statetype[tpos] == STD)
          counts[k] += wgt[idx];

        /* Emission scores. */
        if (tr[idx]->statetype[tpos] == STM)
          pernode[k] += wgt[idx] * Scorify(hmm->msc[sym][k]);
        else if (tr[idx]->statetype[tpos] == STI)
          pernode[k] += wgt[idx] * Scorify(hmm->isc[sym][k]);

        /* Transition scores. */
        if (tr[idx]->statetype[tpos] == STM ||
            tr[idx]->statetype[tpos] == STD ||
            tr[idx]->statetype[tpos] == STI)
          pernode[k] += wgt[idx] *
            Scorify(TransitionScoreLookup(hmm,
                                          tr[idx]->statetype[tpos],   tr[idx]->nodeidx[tpos],
                                          tr[idx]->statetype[tpos+1], tr[idx]->nodeidx[tpos+1]));
      }

  /* Divide accumulated scores by accumulated weighted counts */
  avg = 0.;
  for (k = 1; k <= hmm->M; k++)
    {
      pernode[k] /= counts[k];
      avg        += pernode[k];
    }

  free(counts);
  *ret_avg = avg;
  return;
}

/* Function: frag_trace_score()
 * Date:     SRE, Wed Dec 31 10:03:47 1997 [StL]
 *
 * Purpose:  Allow MD/ME optimization to be used for alignments
 *           that include fragments and multihits -- estimate a full-length
 *           per-domain score.
 *
 * Return:   "corrected" score.
 */
static float
frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
                 float *pernode, float expected)
{
  float sc;                     /* corrected score                      */
  float fragexp;                /* expected score for a trace like this */
  int   tpos;                   /* position in trace                    */

  /* get uncorrected score */
  sc = P7TraceScore(hmm, dsq, tr);

  /* calc expected score for trace like this */
  fragexp = 0.;
  for (tpos = 0; tpos < tr->tlen; tpos++)
    if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD)
      fragexp += pernode[tr->nodeidx[tpos]];

  /* correct for multihits */
  fragexp /= (float) TraceDomainNumber(tr);

  /* extrapolate to full-length, one-hit score */
  sc = sc * expected / fragexp;
  return sc;
}

/* Function: maximum_entropy()
 * Date:     SRE, Fri Jan 2 10:56:00 1998 [StL]
 *
 * Purpose:  Optimizes a model according to maximum entropy weighting.
 *           See Krogh and Mitchison (1995).
 *
 *           [Actually, we do minimum relative entropy, rather than
 *           maximum entropy. Same thing, though we refer to "ME"
 *           weights and models. The optimization is a steepest
 *           descents minimization of the relative entropy.]
 *
 *           Expects to be called shortly after a Maxmodelmaker()
 *           or Handmodelmaker(), so that both a new model architecture
 *           (with MAP parameters) and fake tracebacks are available.
 *
 *           Prints a summary of optimization progress to stdout.
 *
 * Args:     hmm      - model. allocated, set with initial MAP parameters.
 *           dsq      - dealigned digitized seqs the model is based on
 *           msa      - multiple alignment the model is based on
 *           eff_nseq - effective sequence number; weights normalize up to this.
 *           prior    - prior distributions for parameterizing model
 *           tr       - array of fake traces for each sequence
 *
 * Return:   (void)
 *           hmm changed to an ME HMM
 *           msa->wgt changed, contains ME weights
 */
static void
maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa,
                float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr)
{
  float *wgt;                   /* current best set of ME weights  */
  float *new_wgt;               /* new set of ME weights to try    */
  float *sc;                    /* log-odds score of each sequence */
  float *grad;                  /* gradient                        */
  float  epsilon;               /* steepness of descent            */
  float  relative_entropy;      /* current best relative entropy   */
  float  new_entropy;           /* relative entropy at new weights */
  float  last_new_entropy;      /* last new_entropy we calc'ed     */
  float  use_epsilon;           /* current epsilon value in use    */
  int    idx;                   /* counter over sequences          */
  int    i1, i2;                /* counters for iterations         */
  float  converge_criterion;
  float  minw, maxw;            /* min, max weight                 */
  int    posw, highw;           /* number of positive weights      */
  float  mins, maxs, avgs;      /* min, max, avg score             */
  float *pernode;               /* expected score per node of HMM  */
  float  expscore;              /* expected score of complete HMM  */
  int    max_iter;              /* bulletproof against infinite loop bugs */

  epsilon  = 0.2;               /* works fine */
  max_iter = 666;

  /* Allocations */
  sc      = MallocOrDie (sizeof(float) * msa->nseq);
  wgt     = MallocOrDie (sizeof(float) * msa->nseq);
  new_wgt = MallocOrDie (sizeof(float) * msa->nseq);
  grad    = MallocOrDie (sizeof(float) * msa->nseq);
  pernode = MallocOrDie (sizeof(float) * (hmm->M+1));

  /* Initialization. Start with all weights == 1.0.
   * Find relative entropy and gradient.
   */
  Plan7SWConfig(hmm, 0.5, 0.5);
  P7Logoddsify(hmm, TRUE);

  FSet(wgt, msa->nseq, 1.0);
  position_average_score(hmm, dsq, wgt, msa->nseq, tr, pernode, &expscore);
  for (idx = 0; idx < msa->nseq; idx++)
    sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
  relative_entropy = FSum(sc, msa->nseq) / (float) msa->nseq;
  for (idx = 0; idx < msa->nseq; idx++)
    grad[idx] = relative_entropy - sc[idx];

  printf("iter avg-sc min-sc max-sc min-wgt max-wgt +wgt ++wgt rel.ent convergence\n");
  printf("---- ------ ------ ------ ------- ------- ---- ----- ------- -----------\n");

  mins = maxs = avgs = sc[0];
  for (idx = 1; idx < msa->nseq; idx++)
    {
      if (sc[idx] < mins) mins = sc[idx];
      if (sc[idx] > maxs) maxs = sc[idx];
      avgs += sc[idx];
    }
  avgs /= (float) msa->nseq;
  printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8s\n",
         0, avgs, mins, maxs, 1.0, 1.0, msa->nseq, 0, relative_entropy, "-");

  /* Steepest descents optimization;
   * iterate until relative entropy converges.
   */
  i1 = 0;
  while (++i1 < max_iter)
    {
      /* Gradient gives us a line of steepest descents.
       * (Roughly speaking, anyway. We actually have a constraint
       * that weights are nonnegative and normalized, and the
       * gradient doesn't take these into account.)
       * Look along this line, a distance of epsilon * gradient:
       * if new point is better, accept; if new point is worse,
       * move back along the line by half the distance and re-evaluate.
       */
      use_epsilon = epsilon;
      new_entropy = relative_entropy + 1.0;    /* just ensure new > old */

      i2 = 0;
      while (new_entropy > relative_entropy && ++i2 < max_iter)
        {
          last_new_entropy = new_entropy;

          /* find a new point in weight space */
          for (idx = 0; idx < msa->nseq; idx++)
            {
              new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx];
              if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0;
            }
          FNorm(new_wgt, msa->nseq);
          FScale(new_wgt, msa->nseq, (float) msa->nseq);

          /* Make new HMM using these weights */
          ZeroPlan7(hmm);
          for (idx = 0; idx < msa->nseq; idx++)
            P7TraceCount(hmm, dsq[idx], new_wgt[idx], tr[idx]);
          P7PriorifyHMM(hmm, prior);

          /* Evaluate new point */
          Plan7SWConfig(hmm, 0.5, 0.5);
          P7Logoddsify(hmm, TRUE);
          position_average_score(hmm, dsq, new_wgt, msa->nseq, tr, pernode, &expscore);
          for (idx = 0; idx < msa->nseq; idx++)
            sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
          new_entropy = FDot(sc, new_wgt, msa->nseq) / (float) msa->nseq;

          use_epsilon /= 2.0;
          /* Failsafe: we're not converging. Set epsilon to zero,
           * do one more round.
           */
          if (use_epsilon < 1e-6) use_epsilon = 0.0;
          if (use_epsilon == 0.0) break;

          /* Failsafe: avoid infinite loops. Sometimes the new entropy
           * converges without ever being better than the previous point,
           * probably as a result of minor roundoff error.
           */
          if (last_new_entropy == new_entropy) break;
        }
      if (i2 == max_iter) printf("   -- exceeded maximum iterations; giving up --\n");

      /* Evaluate convergence before accepting the new weights;
       * then, accept the new point and evaluate the gradient there.
       */
      converge_criterion = fabs((relative_entropy - new_entropy) / relative_entropy);
      relative_entropy   = new_entropy;
      FCopy(wgt, new_wgt, msa->nseq);
      for (idx = 0; idx < msa->nseq; idx++)
        grad[idx] = relative_entropy - sc[idx];

      /* Print some statistics about this iteration */
      mins  = maxs = avgs = sc[0];
      minw  = maxw = wgt[0];
      posw  = (wgt[0] > 0.0) ? 1 : 0;
      highw = (wgt[0] > 1.0) ? 1 : 0;
      for (idx = 1; idx < msa->nseq; idx++)
        {
          if (sc[idx]  < mins) mins = sc[idx];
          if (sc[idx]  > maxs) maxs = sc[idx];
          if (wgt[idx] < minw) minw = wgt[idx];
          if (wgt[idx] > maxw) maxw = wgt[idx];
          if (wgt[idx] > 0.0)  posw++;
          if (wgt[idx] > 1.0)  highw++;
          avgs += sc[idx];
        }
      avgs /= (float) msa->nseq;
      printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8.5f\n",
             i1, avgs, mins, maxs, minw, maxw, posw, highw,
             relative_entropy, converge_criterion);

      if (converge_criterion < 1e-5) break;
    }
  if (i1 == max_iter) printf("   -- exceeded maximum iterations; giving up --\n");

  /* Renormalize weights to sum to eff_nseq, and save. */
  FNorm(wgt, msa->nseq);
  FScale(wgt, msa->nseq, (float) eff_nseq);
  FCopy(msa->wgt, wgt, msa->nseq);

  /* Make final HMM using these adjusted weights */
  ZeroPlan7(hmm);
  for (idx = 0; idx < msa->nseq; idx++)
    P7TraceCount(hmm, dsq[idx], wgt[idx], tr[idx]);
  P7PriorifyHMM(hmm, prior);

  /* Cleanup and return */
  free(pernode);
  free(new_wgt);
  free(grad);
  free(wgt);
  free(sc);
  return;
}
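The correction that frag_trace_score() applies boils down to one line of arithmetic: scale the raw trace score by the ratio of the expected full-length, one-domain score to the expected score of just the nodes the fragment visited, after dividing the latter by the number of domains in the trace. The standalone sketch below is not part of hmmbuild.c; the pernode values, the visited node list, the domain count, and the scores are all invented stand-ins for what position_average_score(), the traceback, TraceDomainNumber(), and P7TraceScore() would supply.

/* Illustrative sketch only -- not part of hmmbuild.c.
 * All numbers are hypothetical; this just redoes the fragment-score
 * extrapolation of frag_trace_score() without the HMMER structures.
 */
#include <stdio.h>

int
main(void)
{
  float pernode[6] = { 0., 1.5, 2.0, 1.0, 2.5, 3.0 }; /* expected score per node 1..5        */
  int   visited[]  = { 2, 3, 4 };  /* nodes reached by the fragment's M/D states             */
  int   nvisited   = 3;
  int   ndomains   = 1;            /* what TraceDomainNumber() would report                  */
  float expected   = 10.0;         /* expected full-length, one-domain score                 */
  float sc         = 4.2;          /* raw fragment score, as P7TraceScore() would return it  */
  float fragexp    = 0.;
  int   i;

  /* expected score of a trace shaped like this fragment */
  for (i = 0; i < nvisited; i++)
    fragexp += pernode[visited[i]];
  fragexp /= (float) ndomains;     /* correct for multihits */

  /* extrapolate to a full-length, one-hit score */
  printf("corrected score = %.2f\n", sc * expected / fragexp);
  return 0;
}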
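The inner loop of maximum_entropy() is a projected steepest-descent step: move the weights along the gradient by use_epsilon, clamp anything negative to zero, then renormalize so the weights again sum to nseq. The standalone sketch below isolates that single update; norm_to() is a local helper assumed to behave like the FNorm()/FScale() pair used above, and the weight and gradient values are made up for illustration.

/* Illustrative sketch only -- not part of hmmbuild.c.
 * One steepest-descent weight update, with made-up values.
 */
#include <stdio.h>

/* local stand-in for FNorm() followed by FScale(): rescale v to sum to total */
static void
norm_to(float *v, int n, float total)
{
  float sum = 0.;
  int   i;
  for (i = 0; i < n; i++) sum += v[i];
  for (i = 0; i < n; i++) v[i] = v[i] / sum * total;
}

int
main(void)
{
  float wgt[4]  = { 1.0, 1.0, 1.0, 1.0 };   /* current weights                          */
  float grad[4] = { 0.8, -0.3, -0.6, 0.1 }; /* hypothetical relative_entropy - sc[idx]  */
  float new_wgt[4];
  float use_epsilon = 0.2;
  int   nseq = 4, idx;

  /* one step along the gradient, clamped at zero */
  for (idx = 0; idx < nseq; idx++)
    {
      new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx];
      if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0;
    }
  norm_to(new_wgt, nseq, (float) nseq);     /* keep weights summing to nseq */

  for (idx = 0; idx < nseq; idx++)
    printf("seq %d: weight %.3f -> %.3f\n", idx, wgt[idx], new_wgt[idx]);
  return 0;
}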