emit.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 366 行

C
366
字号
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-2003 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. ************************************************************//* emit.c * SRE, Sun Mar  8 12:26:58 1998 * CVS $Id: emit.c,v 1.9 2003/10/04 18:26:49 eddy Exp $ *  * Generation of sequences/traces from an HMM. */#include "config.h"#include "squidconf.h"#include <ctype.h>#include "structs.h"#include "funcs.h"#include "squid.h"/* Function: EmitSequence() * Date:     SRE, Sun Mar  8 12:28:03 1998 [St. Louis] * * Purpose:  Given a model, sample a sequence and/or traceback. * * Args:     hmm     - the model *           ret_dsq - RETURN: generated digitized sequence (pass NULL if unwanted) *           ret_L   - RETURN: length of generated sequence  *           ret_tr  - RETURN: generated trace (pass NULL if unwanted) * * Returns:  void */voidEmitSequence(struct plan7_s *hmm, unsigned char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr){  struct p7trace_s *tr;  unsigned char    *dsq;        /* generated sequence, digitized */  char  type;               	/* current state type */  int   k;			/* current node index */  int   L;			/* length of sequence */  int   alloc_tlen;		/* allocated space for traceback */  int   alloc_L;		/* allocated space for sequence  */  int   tpos;			/* position in traceback */  unsigned char sym;		/* a generated symbol index */  float t[4];			/* little array for choosing M transition from */    /* Initialize; allocations   */  P7AllocTrace(64, &tr);  alloc_tlen = 64;  dsq = MallocOrDie(sizeof(unsigned char) * 64);  alloc_L = 64;  TraceSet(tr, 0, STS, 0, 0);  TraceSet(tr, 1, STN, 0, 0);  dsq[0] = Alphabet_iupac;  L      = 1;  k      = 0;  type   = STN;  tpos   = 2;  while (type != STT)     {      /* Deal with state transition       */      switch (type) {      case STB:	hmm->begin[0] = hmm->tbd1; /* begin[0] hack (documented in structs.h) */	k = FChoose(hmm->begin, hmm->M+1);	if (k == 0) { type = STD; k = 1; } else {type = STM; }	break;      case STI:	type = (FChoose(hmm->t[k]+TIM, 2) == 0)    ? STM : STI; if (type == STM) k++; break;      case STN: type = (FChoose(hmm->xt[XTN], 2)  == LOOP) ? STN : STB; k = 0; break;      case STE:	type = (FChoose(hmm->xt[XTE], 2)  == LOOP) ? STJ : STC; k = 0; break;      case STC:	type = (FChoose(hmm->xt[XTC], 2)  == LOOP) ? STC : STT; k = 0; break;      case STJ:	type = (FChoose(hmm->xt[XTJ], 2)  == LOOP) ? STJ : STB; k = 0; break;      case STD:		if (k < hmm->M) {	  type = (FChoose(hmm->t[k]+TDM, 2) == 0) ? STM : STD; 	  k++;   	} else {	  type = STE;	  k = 0;	}	break;      case STM:	if (k < hmm->M) {	  FCopy(t, hmm->t[k], 3);	  t[3] = hmm->end[k];	  switch (FChoose(t,4)) {	  case 0: k++;  type = STM; break;	  case 1:       type = STI; break;	  case 2: k++;  type = STD; break;	  case 3: k=0;  type = STE; break;	  default: Die("never happens");	  }	} else {	  k    = 0;	  type = STE;	}	break;      case STT:      case STBOGUS:      default:	Die("can't happen.");      }        /* Choose a symbol emission, if necessary       */      sym = Alphabet_iupac;	/* use sentinel byte to mean "not set yet" */      if      (type == STM) sym = FChoose(hmm->mat[k], Alphabet_size);      else if (type == STI) sym = FChoose(hmm->ins[k], Alphabet_size);       else if ((type == STN && tr->statetype[tpos-1] == STN) ||	       (type == STC && tr->statetype[tpos-1] == STC) ||	       (type == STJ && tr->statetype[tpos-1] == STJ))	sym = FChoose(hmm->null, Alphabet_size);	      /* Add to the traceback; deal with realloc if necessary       */      TraceSet(tr, tpos, type, k, (sym != Alphabet_iupac) ? L : 0);      tpos++;      if (tpos == alloc_tlen) {	alloc_tlen += 64; 	P7ReallocTrace(tr, alloc_tlen);      }      /* Add to the digitized seq; deal with realloc, if necessary       */      if (sym != Alphabet_iupac) {	dsq[L] = sym;	L++;	if (L == alloc_L) {		  alloc_L += 64;	  dsq = ReallocOrDie(dsq, sizeof(unsigned char) * alloc_L);	}      }    }    /* Finish off the trace   */   tr->tlen = tpos;  /* Finish off the dsq with sentinel byte and null terminator.   * Emitted sequence length is L-1. No, this isn't a bug - L is   * the index of the next position, and since we're ending, the   * sequence is L-1 and the sentinel goes in position L.   */  dsq[L]   = Alphabet_iupac;  L--;  /* Return   */  if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);  if (ret_L   != NULL) *ret_L   = L;  if (ret_tr  != NULL) *ret_tr  = tr;  else P7FreeTrace(tr);  return;}/* Function: EmitConsensusSequence() * Date:     SRE, Wed Nov 11 11:08:59 1998 [St. Louis] * * Purpose:  Generate a "consensus sequence". For the purposes *           of a profile HMM, this is defined as: *              - for each node: *                 - if StateOccupancy() says that M is used  *                     with probability >= 0.5, this M is "consensus". *                     Then, choose maximally likely residue. *                     if P>0.5 (protein) or P>0.9 (DNA), make *                     it upper case; else make it lower case.  *                 - if StateOccupancy() says that I *                     is used with P >= 0.5, this I is "consensus"; *                     use it 1/(1-TII) times (its expectation value). *                     Generate an "x" from each I. *                      *           The function expects that the model is config'ed *           by Plan7NakedConfig(): that is, for a single global pass *           with no N,C,J involvement. *                      * * Args:     hmm     - the model *           ret_seq - RETURN: consensus sequence (pass NULL if unwanted) *           ret_dsq - RETURN: digitized consensus sequence (pass NULL if unwanted) *           ret_L   - RETURN: length of generated sequence  *           ret_tr  - RETURN: generated trace (pass NULL if unwanted) * * Returns:  void         */voidEmitConsensusSequence(struct plan7_s *hmm, char **ret_seq, unsigned char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr){  struct p7trace_s *tr;         /* RETURN: traceback */  unsigned char    *dsq;        /* RETURN: sequence in digitized form */  char             *seq;        /* RETURN: sequence in undigitized form */  float *mp, *ip, *dp;          /* state occupancies from StateOccupancy() */  int    nmat, ndel, nins;	/* number of matches, deletes, inserts used */  int    k;			/* counter for nodes */  int    tpos;			/* position in trace */  int    i;                     /* position in seq (equiv pos in dsq is i+1 */  int    x;			/* symbol choice (M) or # symbols (I) */  float  mthresh;		/* >= this, show symbol as upper case */  if (Alphabet_type == hmmAMINO) mthresh = 0.5;  else                           mthresh = 0.9;  StateOccupancy(hmm, &mp, &ip, &dp);  /* First pass: how many states do we need in the trace?   *             how long will the sequence be?   */  nmat = ndel = nins = 0;  for (k = 1; k <= hmm->M; k++)    {      if (mp[k] >= 0.5) nmat++; else ndel++;      if (k < hmm->M && ip[k] >= 0.5) 	nins += (int) (1.f / (1.f - hmm->t[k][TII]));    }    /* Allocations   */  P7AllocTrace(6 + nmat + ndel + nins, &tr);  dsq = MallocOrDie(sizeof(unsigned char) * (nmat+nins+3));  seq = MallocOrDie(sizeof(char) * (nmat+nins+1));  /* Main pass.   * Construct consensus trace, seq, and dsq.   */  TraceSet(tr, 0, STS, 0, 0);  TraceSet(tr, 1, STN, 0, 0);  TraceSet(tr, 2, STB, 0, 0);  dsq[0] = Alphabet_iupac;	/* guard byte */  tpos = 3;  i    = 0;  for (k = 1; k <= hmm->M; k++)    {      if (mp[k] >= 0.5)	{	  x = FArgMax(hmm->mat[k], Alphabet_size);	  TraceSet(tr, tpos, STM, k, i+1);	  seq[i]   = Alphabet[x];	  dsq[i+1] = x;	  if (hmm->mat[k][x] < mthresh)	    seq[i] = tolower((int) seq[i]);	  i++;	  tpos++;	}      else	{	  TraceSet(tr, tpos, STD, k, 0);	  tpos++;	}      if (k < hmm->M && ip[k] >= 0.5)	{	  x = (int) (1.f / (1.f - hmm->t[k][TII]));	  while (x--) 	    {	      TraceSet(tr, tpos, STI, k, i+1);	      seq[i]   = 'x';	      dsq[i+1] = Alphabet_iupac - 1;	      i++; 	      tpos++;	    }	}    }  TraceSet(tr, tpos, STE, 0, 0); tpos++;  TraceSet(tr, tpos, STC, 0, 0); tpos++;  TraceSet(tr, tpos, STT, 0, 0); tpos++;  dsq[i+1] = Alphabet_iupac;      free(mp);  free(ip);  free(dp);  if (ret_seq != NULL) *ret_seq = seq; else free(seq);  if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);  if (ret_L   != NULL) *ret_L   = i;     if (ret_tr  != NULL) *ret_tr  = tr;  else P7FreeTrace(tr);}/* Function: StateOccupancy() * Date:     SRE, Wed Nov 11 09:46:15 1998 [St. Louis] * * Purpose:  Calculate the expected state occupancy for *           a given HMM in generated traces. *            *           Note that expected prob of getting into *           any special state in a trace is trivial: *              S,N,B,E,C,T = 1.0 *              J = E->J transition prob  * * Args:     hmm    - the model *           ret_mp - RETURN: [1..M] prob's of occupying M *           ret_ip - RETURN: [1..M-1] prob's of occupying I *           ret_dp - RETURN: [1..M] prob's of occupying D * * Returns:  void *           mp, ip, dp are malloc'ed here. Caller must free(). */voidStateOccupancy(struct plan7_s *hmm, float **ret_mp, float **ret_ip, float **ret_dp){  float *fmp, *fip, *fdp;       /* forward probabilities  */  int k;			/* counter for nodes      */  /* Initial allocations   */  fmp = MallocOrDie (sizeof(float) * (hmm->M+1));  fip = MallocOrDie (sizeof(float) * (hmm->M));  fdp = MallocOrDie (sizeof(float) * (hmm->M+1));    /* Forward pass.    */  fdp[1] = hmm->tbd1;  fmp[1] = hmm->begin[1];  fip[1] = fmp[1] * hmm->t[1][TMI];  for (k = 2; k <= hmm->M; k++)    {			/* M: from M,D,I at k-1, or B; count t_II as 1.0 */      fmp[k] = fmp[k-1] * hmm->t[k-1][TMM] +	       fip[k-1] +               fdp[k-1] * hmm->t[k-1][TDM] +	       hmm->begin[k];			/* D: from M,D at k-1 */      fdp[k] = fmp[k-1] * hmm->t[k-1][TMD] +	       fdp[k-1] * hmm->t[k-1][TDD];			/* I: from M at k; don't count II */      if (k < hmm->M) {	fip[k] = fmp[k] * hmm->t[k][TMI];      }      SQD_DASSERT2((fabs(1.0f - fmp[k] - fdp[k]) < 1e-6f));      fmp[k] /= fmp[k]+fdp[k];	/* prevent propagating fp errors */      fdp[k] /= fmp[k]+fdp[k];    }  /* We don't need a backward pass; all backwards P's are 1.0   * by definition (you can always get out of a state with P=1).   * The only situation where this might not be true is for   * a TII of 1.0, when TIM = 0 -- but in that case, if there's   * a finite chance of getting into that insert state, the model   * generates infinitely long sequences, so we can consider this   * situation "perverse" and disallow it elsewhere in building   * profile HMMs.   */  /* Return.   */  *ret_mp = fmp;  *ret_dp = fdp;  *ret_ip = fip;}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?