⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 modelmakers.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 3 页
字号:
				/* Initialize insert counters to zero */    FSet(insc, Alphabet_size, 0.);    for (idx = 0; idx < msa->nseq; idx++) insopt[idx] = 0;    sc[i] = -FLT_MAX;     for (j = i+1; j <= last; j++) {			/* build transition matrix for column pair i,j */      code = build_cij(msa->aseq, msa->nseq, insopt, i, j, msa->wgt, cij);      if (code == -1) break;	/* no j to our right can work for us */      if (code == 1) {	FCopy(tij, cij, 7);	P7PriorifyTransitionVector(tij, prior, prior->tq); 	FNorm(tij, 3);	tij[TMM] = sreLOG2(tij[TMM] / null_p1); 	tij[TMI] = sreLOG2(tij[TMI] / null_p1); 	tij[TMD] = sreLOG2(tij[TMD]); 	tij[TIM] = sreLOG2(tij[TIM] / null_p1); 	tij[TII] = sreLOG2(tij[TII] / null_p1); 	tij[TDM] = sreLOG2(tij[TDM] / null_p1); 	tij[TDD] = sreLOG2(tij[TDD]);   				/* calculate the score of using this j. */	new = sc[j] +  FDot(tij, cij, 7) + FDot(insp, insc, Alphabet_size);	SQD_DPRINTF2(("%3d %3d new=%6.2f scj=%6.2f m=%6.2f i=%6.2f t=%6.2f\n",	       i, j, new, sc[j], FDot(matp, matc[i], Alphabet_size), 	       FDot(insp, insc, Alphabet_size), FDot(tij, cij, 7)));				/* keep it if it's better */	if (new > sc[i]) {	  sc[i]   = new;	  tbck[i] = j;	}       }				/* bump insc, insopt insert symbol counters */      FAdd(insc, matc[j], Alphabet_size);      for (idx = 0; idx < msa->nseq; idx++)	if (!isgap(msa->aseq[idx][j-1])) insopt[idx]++;    }				/* add in constant contributions for col i */				/* note ad hoc scaling of mpri by wgtsum (us. nseq)*/    sc[i] += FDot(matp, matc[i], Alphabet_size) + mpri * wgtsum;  } /* end loop over start positions i */  /* Termination: place the begin state.   * log odds score for S->N->B is all zero except for NB transition, which   * is a constant. So we only have to evaluate BM transitions.   */  bestsc = -FLT_MAX;  for (i = 1; i <= last; i++) {    new = sc[i];     for (idx = 0; idx < msa->nseq; idx++) {      if (isgap(msa->aseq[idx][j-1])) 	new += bm2;		/* internal B->M transition */      else	new += bm1;		/* B->M1 transition         */    }    if (new > bestsc) {      bestsc = new;      first  = i;    }  }  /* Traceback   */  matassign[0] = 0;  for (i = 1; i <= msa->alen; i++) matassign[i] = ASSIGN_INSERT;   for (i = first; i != 0; i = tbck[i]) {    matassign[i] &= ~ASSIGN_INSERT;    matassign[i] |= ASSIGN_MATCH;  }  /* Hand matassign off for remainder of model construction   */  /*  print_matassign(matassign, ainfo->alen); */  matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr);  /* Clean up.   */  for (i = 1; i <= msa->alen; i++) free(matc[i]);  free(matc);  free(sc);  free(tbck);  free(matassign);  free(insopt);}/* Function: build_cij() *  * Purpose:  Construct a counts vector for transitions between *           column i and column j in a multiple alignment. * *           '_' gap characters indicate "external" gaps which *           are to be dealt with by B->M and M->E transitions.  *           These characters must be placed by a preprocessor. * *           insopt is an "insert optimization" -- an incrementor *           which keeps track of the number of insert symbols *           between i and j. *        * Args:     aseqs  - multiple alignment. [0.nseq-1][0.alen-1] *           nseq   - number of seqs in aseqs *           insopt - number of inserts per seq between i/j [0.nseq-1] *           i      - i column [1.alen], off by one from aseqs *           j      - j column [1.alen], off by one from aseqs *           wgt    - per-seq weights [0.nseq-1] *           cij    - transition count vectors [0..7] *            * Return:   -1 if an illegal transition was seen for this i/j assignment *and* *              we are guaranteed that any j to the right will also *              have illegal transitions. *           0  if an illegal transition was seen, but a j further to the *              right may work.   *           1 if all transitions were legal. */          static int build_cij(char **aseqs, int nseq, int *insopt, int i, int j,          float *wgt, float *cij){  int idx;			/* counter for seqs */  i--;				/* make i,j relative to aseqs [0..alen-1] */  j--;  FSet(cij, 8, 0.);		/* zero cij */  for (idx = 0; idx < nseq; idx++) {    if (insopt[idx] > 0) {      if (isgap(aseqs[idx][i])) return -1; /* D->I prohibited. */      if (isgap(aseqs[idx][j])) return 0;  /* I->D prohibited. */      cij[TMI] += wgt[idx];      cij[TII] += (insopt[idx]-1) * wgt[idx];      cij[TIM] += wgt[idx];    } else {      if (!isgap(aseqs[idx][i])) {	if (aseqs[idx][j] == '_')      ; /* YO! what to do with trailer? */	else if (isgap(aseqs[idx][j])) cij[TMD] += wgt[idx];	else                           cij[TMM] += wgt[idx];      } else {			/* ignores B->E possibility */	if (aseqs[idx][j] == '_')      continue;	else if (isgap(aseqs[idx][j])) cij[TDD] += wgt[idx];	else                           cij[TDM] += wgt[idx];      }    }  }  return 1;}/* Function: estimate_model_length() *  * Purpose:  Return a decent guess about the length of the model, *           based on the lengths of the sequences. *            *           Algorithm is dumb: use weighted average length. *            *           Don't assume that weights sum to nseq! */static intestimate_model_length(MSA *msa){  int   idx;  float total = 0.;  float wgtsum = 0.;  for (idx = 0; idx < msa->nseq; idx++)    {      total  += msa->wgt[idx] * DealignedLength(msa->aseq[idx]);      wgtsum += msa->wgt[idx];    }      return (int) (total / wgtsum);} /* Function: matassign2hmm() *  * Purpose:  Given an assignment of alignment columns to match vs. *           insert, finish the final part of the model construction  *           calculation that is constant between model construction *           algorithms. *            * Args:     msa       - multiple sequence alignment *           dsq       - digitized unaligned aseq's *           matassign - 1..alen bit flags for column assignments *           ret_hmm   - RETURN: counts-form HMM *           ret_tr    - RETURN: array of tracebacks for aseq's *                          * Return:   (void) *           ret_hmm and ret_tr alloc'ed here for the calling *           modelmaker function. */static voidmatassign2hmm(MSA *msa, char **dsq, int *matassign, 	      struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr){  struct plan7_s    *hmm;       /* RETURN: new hmm                     */  struct p7trace_s **tr;        /* fake tracebacks for each seq        */  int      M;                   /* length of new model in match states */  int      idx;                 /* counter over sequences              */  int      apos;                /* counter for aligned columns         */				/* how many match states in the HMM? */  M = 0;  for (apos = 1; apos <= msa->alen; apos++) {    if (matassign[apos] & ASSIGN_MATCH)       M++;  }				/* delimit N-terminal tail */  for (apos=1; matassign[apos] & ASSIGN_INSERT && apos <= msa->alen; apos++)    matassign[apos] |= EXTERNAL_INSERT_N;  if (apos <= msa->alen) matassign[apos] |= FIRST_MATCH;				/* delimit C-terminal tail */  for (apos=msa->alen; matassign[apos] & ASSIGN_INSERT && apos > 0; apos--)    matassign[apos] |= EXTERNAL_INSERT_C;  if (apos > 0) matassign[apos] |= LAST_MATCH;  /* print_matassign(matassign, msa->alen);  */				/* make fake tracebacks for each seq */  fake_tracebacks(msa->aseq, msa->nseq, msa->alen, matassign, &tr);				/* build model from tracebacks */  hmm = AllocPlan7(M);  ZeroPlan7(hmm);  for (idx = 0; idx < msa->nseq; idx++) {    /* P7PrintTrace(stdout, tr[idx], NULL, NULL);   */    P7TraceCount(hmm, dsq[idx], msa->wgt[idx], tr[idx]);  }				/* annotate new model */  annotate_model(hmm, matassign, msa);  /* Set #=RF line of alignment to reflect our assignment   * of match, delete. matassign is valid from 1..alen and is off   * by one from msa->rf.   */  if (msa->rf != NULL) free(msa->rf);  msa->rf = (char *) MallocOrDie (sizeof(char) * (msa->alen + 1));  for (apos = 0; apos < msa->alen; apos++)    msa->rf[apos] = matassign[apos+1] & ASSIGN_MATCH ? 'x' : '.';  msa->rf[msa->alen] = '\0';				/* Cleanup and return. */  if (ret_tr != NULL) *ret_tr = tr;  else   { for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]); free(tr); }  if (ret_hmm != NULL) *ret_hmm = hmm; else FreePlan7(hmm);  return;}  /* Function: fake_tracebacks() *  * Purpose:  From a consensus assignment of columns to MAT/INS, construct fake *           tracebacks for each individual sequence. *            * Note:     Fragment tolerant by default. Internal entries are  *           B->M_x, instead of B->D1->D2->...->M_x; analogously *           for internal exits.  *            * Args:     aseqs     - alignment [0..nseq-1][0..alen-1] *           nseq      - number of seqs in alignment *           alen      - length of alignment in columns *           matassign - assignment of column; [1..alen] (off one from aseqs) *           ret_tr    - RETURN: array of tracebacks *            * Return:   (void) *           ret_tr is alloc'ed here. Caller must free. */          static voidfake_tracebacks(char **aseq, int nseq, int alen, int *matassign,		struct p7trace_s ***ret_tr){  struct p7trace_s **tr;  int  idx;                     /* counter over sequences          */  int  i;                       /* position in raw sequence (1..L) */  int  k;                       /* position in HMM                 */  int  apos;                    /* position in alignment columns   */  int  tpos;			/* position in traceback           */  tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * nseq);    for (idx = 0; idx < nseq; idx++)    {      P7AllocTrace(alen+6, &tr[idx]); /* allow room for S,N,B,E,C,T */      				/* all traces start with S state... */      tr[idx]->statetype[0] = STS;      tr[idx]->nodeidx[0]   = 0;      tr[idx]->pos[0]       = 0;				/* ...and transit to N state; N-term tail				   is emitted on N->N transitions */      tr[idx]->statetype[1] = STN;      tr[idx]->nodeidx[1]   = 0;      tr[idx]->pos[1]       = 0;            i = 1;      k = 0;      tpos = 2;      for (apos = 0; apos < alen; apos++)        {	  tr[idx]->statetype[tpos] = STBOGUS; /* bogus, deliberately, to debug */	  if (matassign[apos+1] & FIRST_MATCH)	    {			/* BEGIN */	      tr[idx]->statetype[tpos] = STB;	      tr[idx]->nodeidx[tpos]   = 0;	      tr[idx]->pos[tpos]       = 0;	      tpos++;	    }	  if (matassign[apos+1] & ASSIGN_MATCH && ! isgap(aseq[idx][apos]))	    {			/* MATCH */	      k++;		/* move to next model pos */	      tr[idx]->statetype[tpos] = STM;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -