⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 emit.c

📁 hmmer源程序
💻 C
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. ************************************************************//* emit.c * SRE, Sun Mar  8 12:26:58 1998 * RCS $Id: emit.c,v 1.5 1999/05/02 22:03:47 eddy Exp $ *  * Generation of sequences/traces from an HMM. */#include "structs.h"#include "config.h"#include "funcs.h"#include "squid.h"#include <ctype.h>#ifdef MEMDEBUG#include "dbmalloc.h"#endif/* Function: EmitSequence() * Date:     SRE, Sun Mar  8 12:28:03 1998 [St. Louis] * * Purpose:  Given a model, sample a sequence and/or traceback. * * Args:     hmm     - the model *           ret_dsq - RETURN: generated digitized sequence (pass NULL if unwanted) *           ret_L   - RETURN: length of generated sequence  *           ret_tr  - RETURN: generated trace (pass NULL if unwanted) * * Returns:  void */voidEmitSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr){  struct p7trace_s *tr;  char  type;               	/* current state type */  int   k;			/* current node index */  char *dsq;                    /* generated sequence, digitized */  int   L;			/* length of sequence */  int   alloc_tlen;		/* allocated space for traceback */  int   alloc_L;		/* allocated space for sequence  */  int   tpos;			/* position in traceback */  int   sym;			/* a generated symbol index */  float t[4];			/* little array for choosing M transition from */    /* Initialize; allocations   */  P7AllocTrace(64, &tr);  alloc_tlen = 64;  dsq = MallocOrDie(sizeof(char) * 64);  alloc_L = 64;  TraceSet(tr, 0, STS, 0, 0);  TraceSet(tr, 1, STN, 0, 0);  dsq[0] = (char) Alphabet_iupac;  L      = 1;  k      = 0;  type   = STN;  tpos   = 2;  while (type != STT)     {      /* Deal with state transition       */      switch (type) {      case STB:	hmm->begin[0] = hmm->tbd1; /* begin[0] hack (documented in structs.h) */	k = FChoose(hmm->begin, hmm->M+1);	if (k == 0) { type = STD; k = 1; } else {type = STM; }	break;      case STI:	type = (FChoose(hmm->t[k]+TIM, 2) == 0)    ? STM : STI; if (type == STM) k++; break;      case STN: type = (FChoose(hmm->xt[XTN], 2)  == LOOP) ? STN : STB; k = 0; break;      case STE:	type = (FChoose(hmm->xt[XTE], 2)  == LOOP) ? STJ : STC; k = 0; break;      case STC:	type = (FChoose(hmm->xt[XTC], 2)  == LOOP) ? STC : STT; k = 0; break;      case STJ:	type = (FChoose(hmm->xt[XTJ], 2)  == LOOP) ? STJ : STB; k = 0; break;      case STD:		if (k < hmm->M) {	  type = (FChoose(hmm->t[k]+TDM, 2) == 0) ? STM : STD; 	  k++;   	} else {	  type = STE;	  k = 0;	}	break;      case STM:	if (k < hmm->M) {	  FCopy(t, hmm->t[k], 3);	  t[3] = hmm->end[k];	  switch (FChoose(t,4)) {	  case 0: k++;  type = STM; break;	  case 1:       type = STI; break;	  case 2: k++;  type = STD; break;	  case 3: k=0;  type = STE; break;	  default: Die("never happens");	  }	} else {	  k    = 0;	  type = STE;	}	break;      case STT:      case STBOGUS:      default:	Die("can't happen.");      }        /* Choose a symbol emission, if necessary       */      sym = -1;      if      (type == STM) sym = FChoose(hmm->mat[k], Alphabet_size);      else if (type == STI) sym = FChoose(hmm->ins[k], Alphabet_size);       else if ((type == STN && tr->statetype[tpos-1] == STN) ||	       (type == STC && tr->statetype[tpos-1] == STC) ||	       (type == STJ && tr->statetype[tpos-1] == STJ))	sym = FChoose(hmm->null, Alphabet_size);	      /* Add to the traceback; deal with realloc if necessary       */      TraceSet(tr, tpos, type, k, (sym != -1) ? L : 0);      tpos++;      if (tpos == alloc_tlen) {	alloc_tlen += 64; 	P7ReallocTrace(tr, alloc_tlen);      }      /* Add to the digitized seq; deal with realloc, if necessary       */      if (sym != -1) {	dsq[L] = (char) sym;	L++;	if (L+1 == alloc_L) {	/* L+1 leaves room for sentinel byte + \0 */	  alloc_L += 64;	  dsq = ReallocOrDie(dsq, sizeof(char) * alloc_L);	}      }    }    /* Finish off the trace   */   tr->tlen = tpos;  /* Finish off the dsq with sentinel byte and null terminator.   * Emitted Sequence length is L-1.   */  dsq[L]   = (char) Alphabet_iupac;  dsq[L+1] = '\0';  L--;  /* Return   */  if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);  if (ret_L   != NULL) *ret_L   = L;  if (ret_tr  != NULL) *ret_tr  = tr;  else P7FreeTrace(tr);  return;}#ifdef SRE_REMOVED/* Function: EmitBestSequence() * Date:     SRE, Tue Nov 10 16:21:59 1998 [St. Louis] * * Purpose:  Given a model, emit the maximum probability sequence *           from it: argmax_{seq} P(seq | model). *           This is a sensible HMM equivalent to a "consensus" *           sequence. *           The model should be Plan7NakedConfig()'ed;  *           in particular, if we allowed B->M and M->E, *           the highest probability sequence would be *           artifactually short. (We could do the highest *           scoring sequence instead, to get around this problem, *           but the highest scoring sequence is prone to *           other artifacts -- any looping state N,C,J, or I *           with a positively scoring residue leads to *           an infinitely long "best scoring" sequence.) * * Args:     hmm     - the model *           ret_seq - RETURN: best sequence *           ret_L   - RETURN: length of sequence *           ret_tr  - RETURN: traceback of the model/seq alignment; or NULL. * * Returns:  void */voidEmitBestSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr){  char              *seq;                  /* RETURN: best seq */  struct p7trace_s  *tr;                   /* RETURN: traceback */  float             *mmx, *imx, *dmx;      /* log P forward scores for M,D,I */  char              *mtb, *itb, *dtb;      /* traceback ptrs for M,D,I */  int  x;		        /* counter for symbols */  int  k;			/* counter for nodes   */  float sc;			/* tmp var for a log P */  int  bestsym;  int  rpos;			/* position in a sequence */  int  tpos;			/* position in a trace */  int  tlen;			/* length of the traceback */  /* Initial allocations. We only need a 1D matrix and its shadow;   * it's overkill to use the Plan7Matrix structures, so don't.   */  mmx = MallocOrDie(sizeof(float) * (hmm->M+1));  imx = MallocOrDie(sizeof(float) * (hmm->M));  dmx = MallocOrDie(sizeof(float) * (hmm->M));  mtb = MallocOrDie(sizeof(char)  * (hmm->M+1));  itb = MallocOrDie(sizeof(char)  * (hmm->M));  dtb = MallocOrDie(sizeof(char)  * (hmm->M));  /* Initialization.    * We can safely assume a max probability path of S->N->B->(M1 or D1),   * so just init M1 and D1.   */  mmx[1] = log(hmm->xt[XTN][MOVE]) + log(1.F - hmm->tbd1);  dmx[1] =   /* Main recursion, done as a push.   * The model is used in probability form; no wing folding needed.   */  for (k = 1; k < hmm->M; k++)    {      /* Transits out of match state (init with these)       */      mmx[k+1] = mmx[k] + log(hmm->t[k][TMM]); mtb[k+1] = STM;      dmx[k+1] = mmx[k] + log(hmm->t[k][TMD]); dtb[k+1] = STM;      if (k < hmm->M-1) 	imx[k]   = mmx[k] + log(hmm->t[k][TMI]); itb[k]   = STM;            /* Transits out of delete state       */      if ((sc = dmx[k] + log(hmm->t[k][TDM])) > mmx[k+1]) 	{ mmx[k+1] = sc; mtb[k+1] = STD; }      if ((sc = dmx[k] + log(hmm->t[k][TDD])) > dmx[k+1])	{ dmx[k+1] = sc; dtb[k+1] = STD; }      /* Transits out of insert state (self-loops are never good)       */      if ((sc = imx[k] + log(hmm->t[k][TIM])) > mmx[k+1])	{ mmx[k+1] = sc; mtb[k+1] = STI; }            /* Best emissions       */      x = FMax(hmm->mat[k+1], Alphabet_size);      mmx[k+1] += log(hmm->mat[k+1][x]);      if (k < hmm->M-1) {	x = FMax(hmm->ins[k+1], Alphabet_size);	imx[k+1] += log(hmm->ins[k+1][x]);      }    }}#endif /* SRE_REMOVED *//* Function: EmitConsensusSequence() * Date:     SRE, Wed Nov 11 11:08:59 1998 [St. Louis] * * Purpose:  Generate a "consensus sequence". For the purposes *           of a profile HMM, this is defined as: *              - for each node: *                 - if StateOccupancy() says that M is used  *                     with probability >= 0.5, this M is "consensus". *                     Then, choose maximally likely residue. *                     if P>0.5 (protein) or P>0.9 (DNA), make *                     it upper case; else make it lower case.  *                 - if StateOccupancy() says that I *                     is used with P >= 0.5, this I is "consensus"; *                     use it 1/(1-TII) times (its expectation value). *                     Generate an "x" from each I. *                      *           The function expects that the model is config'ed *           by Plan7NakedConfig(): that is, for a single global pass *           with no N,C,J involvement. *                      * * Args:     hmm     - the model *           ret_seq - RETURN: consensus sequence (pass NULL if unwanted) *           ret_dsq - RETURN: digitized consensus sequence (pass NULL if unwanted) *           ret_L   - RETURN: length of generated sequence  *           ret_tr  - RETURN: generated trace (pass NULL if unwanted) * * Returns:  void         */voidEmitConsensusSequence(struct plan7_s *hmm, char **ret_seq, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr){  struct p7trace_s *tr;         /* RETURN: traceback */  char *dsq, *seq;              /* sequence in digitized and undigitized form */  float *mp, *ip, *dp;          /* state occupancies from StateOccupancy() */  int    nmat, ndel, nins;	/* number of matches, deletes, inserts used */  int    k;			/* counter for nodes */  int    tpos;			/* position in trace */  int    i;                     /* position in seq (equiv pos in dsq is i+1 */  int    x;			/* symbol choice (M) or # symbols (I) */  float  mthresh;		/* >= this, show symbol as upper case */  if (Alphabet_type == hmmAMINO) mthresh = 0.5;  else                           mthresh = 0.9;  StateOccupancy(hmm, &mp, &ip, &dp);  /* First pass: how many states do we need in the trace?   *             how long will the sequence be?   */  nmat = ndel = nins = 0;  for (k = 1; k <= hmm->M; k++)    {      if (mp[k] >= 0.5) nmat++; else ndel++;      if (k < hmm->M && ip[k] >= 0.5) 	nins += (int) (1.f / (1.f - hmm->t[k][TII]));    }    /* Allocations   */  P7AllocTrace(6 + nmat + ndel + nins, &tr);  dsq = MallocOrDie(sizeof(char) * (nmat+nins+3));  seq = MallocOrDie(sizeof(char) * (nmat+nins+1));  /* Main pass.   * Construct consensus trace, seq, and dsq.   */  TraceSet(tr, 0, STS, 0, 0);  TraceSet(tr, 1, STN, 0, 0);  TraceSet(tr, 2, STB, 0, 0);  dsq[0] = Alphabet_iupac;	/* guard byte */  tpos = 3;  i    = 0;  for (k = 1; k <= hmm->M; k++)    {      if (mp[k] >= 0.5)	{	  x = FMax(hmm->mat[k], Alphabet_size);	  TraceSet(tr, tpos, STM, k, i+1);	  seq[i]   = Alphabet[x];	  dsq[i+1] = x;	  if (hmm->mat[k][x] < mthresh)	    seq[i] = tolower((int) seq[i]);	  i++;	  tpos++;	}      else	{	  TraceSet(tr, tpos, STD, k, 0);	  tpos++;	}      if (k < hmm->M && ip[k] >= 0.5)	{	  x = (int) (1.f / (1.f - hmm->t[k][TII]));	  while (x--) 	    {	      TraceSet(tr, tpos, STI, k, i+1);	      seq[i]   = 'x';	      dsq[i+1] = Alphabet_iupac - 1;	      i++; 	      tpos++;	    }	}    }  TraceSet(tr, tpos, STE, 0, 0); tpos++;  TraceSet(tr, tpos, STC, 0, 0); tpos++;  TraceSet(tr, tpos, STT, 0, 0); tpos++;  dsq[i+1] = Alphabet_iupac;      free(mp);  free(ip);  free(dp);  if (ret_seq != NULL) *ret_seq = seq; else free(seq);  if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);  if (ret_L   != NULL) *ret_L   = i;     if (ret_tr  != NULL) *ret_tr  = tr;  else P7FreeTrace(tr);}/* Function: StateOccupancy() * Date:     SRE, Wed Nov 11 09:46:15 1998 [St. Louis] * * Purpose:  Calculate the expected state occupancy for *           a given HMM in generated traces. *            *           Note that expected prob of getting into *           any special state in a trace is trivial: *              S,N,B,E,C,T = 1.0 *              J = E->J transition prob  * * Args:     hmm    - the model *           ret_mp - RETURN: [1..M] prob's of occupying M *           ret_ip - RETURN: [1..M-1] prob's of occupying I *           ret_dp - RETURN: [1..M] prob's of occupying D * * Returns:  void *           mp, ip, dp are malloc'ed here. Caller must free(). */voidStateOccupancy(struct plan7_s *hmm, float **ret_mp, float **ret_ip, float **ret_dp){  float *fmp, *fip, *fdp;       /* forward probabilities  */  int k;			/* counter for nodes      */  /* Initial allocations   */  fmp = MallocOrDie (sizeof(float) * (hmm->M+1));  fip = MallocOrDie (sizeof(float) * (hmm->M));  fdp = MallocOrDie (sizeof(float) * (hmm->M+1));    /* Forward pass.    */  fdp[1] = hmm->tbd1;  fmp[1] = hmm->begin[1];  fip[1] = fmp[1] * hmm->t[1][TMI];  for (k = 2; k <= hmm->M; k++)    {			/* M: from M,D,I at k-1, or B; count t_II as 1.0 */      fmp[k] = fmp[k-1] * hmm->t[k-1][TMM] +	       fip[k-1] +               fdp[k-1] * hmm->t[k-1][TDM] +	       hmm->begin[k];			/* D: from M,D at k-1 */      fdp[k] = fmp[k-1] * hmm->t[k-1][TMD] +	       fdp[k-1] * hmm->t[k-1][TDD];			/* I: from M at k; don't count II */      if (k < hmm->M) {	fip[k] = fmp[k] * hmm->t[k][TMI];      }      SQD_DASSERT2((fabs(1.0f - fmp[k] - fdp[k]) < 1e-6f));      fmp[k] /= fmp[k]+fdp[k];	/* prevent propagating fp errors */      fdp[k] /= fmp[k]+fdp[k];    }  /* We don't need a backward pass; all backwards P's are 1.0   * by definition (you can always get out of a state with P=1).   * The only situation where this might not be true is for   * a TII of 1.0, when TIM = 0 -- but in that case, if there's   * a finite chance of getting into that insert state, the model   * generates infinitely long sequences, so we can consider this   * situation "perverse" and disallow it elsewhere in building   * profile HMMs.   */  /* Return.   */  *ret_mp = fmp;  *ret_dp = fdp;  *ret_ip = fip;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -