trace.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,213 行 · 第 1/3 页

C
1,213
字号
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-2003 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. ************************************************************//* trace.c * SRE, Sat Nov 16 12:34:57 1996 * RCS $Id: trace.c,v 1.20 2003/10/02 16:39:41 eddy Exp $ *  * Support for Plan 7 traceback data structure, p7trace_s. */#include "config.h"#include "squidconf.h"#include <stdio.h>#include <string.h>#include <ctype.h>#include "squid.h"#include "structs.h"#include "funcs.h"static void rightjustify(char *s, int n);/* Function: P7AllocTrace(), P7ReallocTrace(), P7FreeTrace() *  * Purpose:  allocation and freeing of traceback structures */voidP7AllocTrace(int tlen, struct p7trace_s **ret_tr){  struct p7trace_s *tr;    tr =            MallocOrDie (sizeof(struct p7trace_s));  tr->statetype = MallocOrDie (sizeof(char) * tlen);  tr->nodeidx   = MallocOrDie (sizeof(int)  * tlen);  tr->pos       = MallocOrDie (sizeof(int)  * tlen);  *ret_tr = tr;}voidP7ReallocTrace(struct p7trace_s *tr, int tlen){  tr->statetype = ReallocOrDie (tr->statetype, tlen * sizeof(char));  tr->nodeidx   = ReallocOrDie (tr->nodeidx,   tlen * sizeof(int));  tr->pos       = ReallocOrDie (tr->pos,       tlen * sizeof(int));}void P7FreeTrace(struct p7trace_s *tr){  if (tr == NULL) return;  free(tr->pos);  free(tr->nodeidx);  free(tr->statetype);  free(tr);}/* Function: TraceSet() * Date:     SRE, Sun Mar  8 12:39:00 1998 [St. Louis] * * Purpose:  Convenience function; set values at position tpos *           in a trace.  *            * * Args:     tr   - trace object to write to *           tpos - ptr to position in trace to set *           type - statetype e.g. STS, etc. *           idx  - nodeidx 1..M or 0 *           pos  - seq position 1..L or 0 * * Returns:  void */voidTraceSet(struct p7trace_s *tr, int tpos, char type, int idx, int pos){  tr->statetype[tpos] = type;  tr->nodeidx[tpos]   = idx;  tr->pos[tpos]       = pos;}/* Function: MergeTraceArrays() * Date:     SRE, Sun Jul  5 15:09:10 1998 [St. Louis] * * Purpose:  Combine two arrays of traces into a single array. *           Used in hmmalign to merge traces from a fixed alignment *           with traces from individual unaligned seqs. *  *           t1 traces always precede t2 traces in the resulting array. * * Args:     t1 - first set of traces *           n1 - number of traces in t1 *           t2 - second set of traces *           n2 - number of traces in t2 * * Returns:  pointer to new array of traces. *           Both t1 and t2 are free'd here! Do not reuse. */struct p7trace_s **MergeTraceArrays(struct p7trace_s **t1, int n1, struct p7trace_s **t2, int n2){  struct p7trace_s **tr;  int i;			/* index in traces */  tr = MallocOrDie(sizeof(struct p7trace_s *) * (n1+n2));  for (i = 0; i < n1; i++) tr[i]    = t1[i];  for (i = 0; i < n2; i++) tr[n1+i] = t2[i];  free(t1);  free(t2);  return tr;}/* Function: P7ReverseTrace() * Date:     SRE, Mon Aug 25 12:57:29 1997; Denver CO.  *  * Purpose:  Reverse the arrays in a traceback structure. *           Tracebacks from Forward() and Viterbi() are *           collected backwards, and call this function *           when they're done. *            *           It's possible to reverse the arrays in place *           more efficiently; but the realloc/copy strategy *           has the advantage of reallocating the trace *           into the right size of memory. (Tracebacks *           overallocate.) *            * Args:     tr - the traceback to reverse. tr->tlen must be set. *                 * Return:   (void) *           tr is modified. */                voidP7ReverseTrace(struct p7trace_s *tr){  char  *statetype;  int   *nodeidx;  int   *pos;  int    opos, npos;  /* Allocate   */  statetype = MallocOrDie (sizeof(char)* tr->tlen);  nodeidx   = MallocOrDie (sizeof(int) * tr->tlen);  pos       = MallocOrDie (sizeof(int) * tr->tlen);    /* Reverse the trace.   */  for (opos = tr->tlen-1, npos = 0; npos < tr->tlen; npos++, opos--)    {      statetype[npos] = tr->statetype[opos];      nodeidx[npos]   = tr->nodeidx[opos];      pos[npos]       = tr->pos[opos];    }  /* Swap old, new arrays.   */  free(tr->statetype);  free(tr->nodeidx);  free(tr->pos);  tr->statetype = statetype;  tr->nodeidx   = nodeidx;  tr->pos       = pos;}/* Function: P7TraceCount() *  * Purpose:  Count a traceback into a count-based HMM structure. *           (Usually as part of a model parameter re-estimation.) *            * Args:     hmm   - counts-based HMM *           dsq   - digitized sequence that traceback aligns to the HMM (1..L) *           wt    - weight on the sequence *           tr    - alignment of seq to HMM *            * Return:   (void) */voidP7TraceCount(struct plan7_s *hmm, unsigned char *dsq, float wt, struct p7trace_s *tr){  int tpos;                     /* position in tr */  int i;			/* symbol position in seq */    for (tpos = 0; tpos < tr->tlen; tpos++)    {      i = tr->pos[tpos];      /* Emission counts.        * Don't bother counting null states N,J,C.       */      if (tr->statetype[tpos] == STM) 	P7CountSymbol(hmm->mat[tr->nodeidx[tpos]], dsq[i], wt);      else if (tr->statetype[tpos] == STI) 	P7CountSymbol(hmm->ins[tr->nodeidx[tpos]], dsq[i], wt);      /* State transition counts       */      switch (tr->statetype[tpos]) {      case STS:	break;			/* don't bother; P=1 */      case STN:	switch (tr->statetype[tpos+1]) {	case STB: hmm->xt[XTN][MOVE] += wt; break;	case STN: hmm->xt[XTN][LOOP] += wt; break;	default:	  Die("illegal state transition %s->%s in traceback", 	      Statetype(tr->statetype[tpos]),	      Statetype(tr->statetype[tpos+1]));	}	break;      case STB:	switch (tr->statetype[tpos+1]) {	case STM: hmm->begin[tr->nodeidx[tpos+1]] += wt; break;	case STD: hmm->tbd1 += wt;                       break;	default:      	  Die("illegal state transition %s->%s in traceback", 	      Statetype(tr->statetype[tpos]),	      Statetype(tr->statetype[tpos+1]));	}	break;      case STM:	switch (tr->statetype[tpos+1]) {	case STM: hmm->t[tr->nodeidx[tpos]][TMM] += wt; break;	case STI: hmm->t[tr->nodeidx[tpos]][TMI] += wt; break;	case STD: hmm->t[tr->nodeidx[tpos]][TMD] += wt; break;	case STE: hmm->end[tr->nodeidx[tpos]]    += wt; break;	default:    	  Die("illegal state transition %s->%s in traceback", 	      Statetype(tr->statetype[tpos]),	      Statetype(tr->statetype[tpos+1]));	}	break;      case STI:	switch (tr->statetype[tpos+1]) {	case STM: hmm->t[tr->nodeidx[tpos]][TIM] += wt; break;	case STI: hmm->t[tr->nodeidx[tpos]][TII] += wt; break;	default:    	  Die("illegal state transition %s->%s in traceback", 	      Statetype(tr->statetype[tpos]),	      Statetype(tr->statetype[tpos+1]));	}	break;      case STD:	switch (tr->statetype[tpos+1]) {	case STM: hmm->t[tr->nodeidx[tpos]][TDM] += wt; break;	case STD: hmm->t[tr->nodeidx[tpos]][TDD] += wt; break;	case STE: /* ignore; p(D->E) = 1.0 */           break;	default: 	  Die("illegal state transition %s->%s in traceback", 	      Statetype(tr->statetype[tpos]),	      Statetype(tr->statetype[tpos+1]));	}	break;      case STE:	switch (tr->statetype[tpos+1]) {	case STC: hmm->xt[XTE][MOVE] += wt; break;	case STJ: hmm->xt[XTE][LOOP] += wt; break;	default:     	  Die("illegal state transition %s->%s in traceback", 	      Statetype(tr->statetype[tpos]),	      Statetype(tr->statetype[tpos+1]));	}	break;     case STJ:	switch (tr->statetype[tpos+1]) {	case STB: hmm->xt[XTJ][MOVE] += wt; break;	case STJ: hmm->xt[XTJ][LOOP] += wt; break;	default:     	  Die("illegal state transition %s->%s in traceback", 	      Statetype(tr->statetype[tpos]),	      Statetype(tr->statetype[tpos+1]));	}	break;      case STC:	switch (tr->statetype[tpos+1]) {	case STT: hmm->xt[XTC][MOVE] += wt; break;	case STC: hmm->xt[XTC][LOOP] += wt; break;	default:     	 Die("illegal state transition %s->%s in traceback", 	     Statetype(tr->statetype[tpos]),	     Statetype(tr->statetype[tpos+1])); 	}	break;      case STT:	break;			/* T is the last. It makes no transitions. */      default:	Die("illegal state %s in traceback", 	    Statetype(tr->statetype[tpos]));      }    }}/* Function: P7TraceScore() *  * Purpose:  Score a traceback and return the score in scaled bits. *            * Args:     hmm   - HMM with valid log odds scores. *           dsq   - digitized sequence that traceback aligns to the HMM (1..L) *           tr    - alignment of seq to HMM *            * Return:   (void) */floatP7TraceScore(struct plan7_s *hmm, unsigned char *dsq, struct p7trace_s *tr){  int score;			/* total score as a scaled integer */  int tpos;                     /* position in tr */  unsigned char sym;		/* digitized symbol in dsq */    /*  P7PrintTrace(stdout, tr, hmm, dsq); */  score = 0;  for (tpos = 0; tpos < tr->tlen-1; tpos++)    {      sym = dsq[tr->pos[tpos]];      /* Emissions.       * Don't bother counting null states N,J,C.       */      if (tr->statetype[tpos] == STM) 	score += hmm->msc[sym][tr->nodeidx[tpos]];      else if (tr->statetype[tpos] == STI) 	score += hmm->isc[sym][tr->nodeidx[tpos]];      /* State transitions.       */      score += TransitionScoreLookup(hmm, 			     tr->statetype[tpos], tr->nodeidx[tpos],			     tr->statetype[tpos+1], tr->nodeidx[tpos+1]);    }  return Scorify(score);}/* Function: P7Traces2Alignment() *  * Purpose:  Convert an array of traceback structures for a set *           of sequences into a new multiple alignment.  *            *           Insertions are put into lower case and  *           are not aligned; instead, Nterm is right-justified, *           Cterm is left-justified, and internal insertions *           are split in half and the halves are justified in *           each direction (the objective being to increase *           the chances of getting insertions aligned well enough *           for them to become a match). SAM gap char conventions *           are used: - in match columns, . in insert columns *  * NOTE:     Does not recognize J state. *            *           Can handle traces with D->I and I->D transitions; *           though these are disallowed by Plan7, they might be *           generated by aligning an alignment to a model, in *           the ImposeMasterTrace() step. Thus, --withali might *           generate alignments that are inconsistent with Plan7, *           that would have to be trace_doctor()'ed. *           xref STL6/p.117. * * Args:     dsq        - digitized unaligned sequences  *           sqinfo     - array of info about the sequences *           wgt        - weights on seqs *           nseq       - number of sequences *           mlen       - length of model (number of match states) *           tr         - array of tracebacks *           matchonly  - TRUE if we don't print insert-generated symbols at all * Return:   MSA structure; NULL on failure. *           Caller responsible for freeing msa with MSAFree(msa); */          MSA *P7Traces2Alignment(unsigned char **dsq, SQINFO *sqinfo, float *wgt, int nseq, int mlen, 		   struct p7trace_s **tr, int matchonly) {  MSA   *msa;                   /* RETURN: new alignment */  int    idx;                   /* counter for sequences */  int    alen;                  /* width of alignment */  int   *inserts;               /* array of max gaps between aligned columns */  int   *matmap;                /* matmap[k] = apos of match k [1..M] */  int    nins;                  /* counter for inserts */  int    apos;                  /* position in aligned sequence (0..alen-1)*/  int    rpos;                  /* position in raw digital sequence (1..L)*/  int    tpos;                  /* position counter in traceback */  int    statetype;		/* type of current state, e.g. STM */  int    k;                     /* counter over states in model */  /* Here's the problem. We want to align the match states in columns,   * but some sequences have inserted symbols in them; we need some   * sort of overall knowledge of where the inserts are and how long   * they are in order to create the alignment.   *    * Here's our trick. inserts[] is a 0..hmm->M array; inserts[i] stores   * the maximum number of times insert substate i was used. This   * is the maximum number of gaps to insert between canonical    * column i and i+1.  inserts[0] is the N-term tail; inserts[M] is   * the C-term tail.   *    * Remember that N and C emit on transition, hence the check for an

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?