⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 modelmakers.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 3 页
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* modelmakers.c * SRE, Fri Nov 15 10:00:04 1996 *  * Construction of models from multiple alignments. Three versions: *    Handmodelmaker() -- use #=RF annotation to indicate match columns *    Fastmodelmaker() -- Krogh/Haussler heuristic *    Maxmodelmaker()  -- MAP model construction algorithm (Eddy,  *                          unpublished) *                           * The meat of the model construction code is in matassign2hmm(). * The three model construction strategies simply label which columns * are supposed to be match states, and then hand this info to * matassign2hmm(). *  * Two wrinkles to watch for: * 1) The alignment is assumed to contain sequence fragments. Look in *    fake_tracebacks() for how internal entry/exit points are handled. * 2) Plan7 disallows DI and ID transitions, but an alignment may *    imply these. Look in trace_doctor() for how DI and ID transitions *    are removed. */#include <stdio.h>#include <string.h>#include <limits.h>#include <math.h>#include <float.h>#include <ctype.h>#include "structs.h"#include "config.h"#include "funcs.h"#include "squid.h"#include "msa.h"/* flags used for matassign[] arrays --  *   assignment of aligned columns to match/insert states */#define ASSIGN_MATCH      (1<<0) #define FIRST_MATCH       (1<<1)#define LAST_MATCH        (1<<2)#define ASSIGN_INSERT     (1<<3)#define EXTERNAL_INSERT_N (1<<4)#define EXTERNAL_INSERT_C (1<<5) static int build_cij(char **aseqs, int nseq, int *insopt, int i, int j,		     float *wgt, float *cij);static int estimate_model_length(MSA *msa);static void matassign2hmm(MSA *msa, char **dsq,			  int *matassign, struct plan7_s **ret_hmm,			  struct p7trace_s ***ret_tr);static void fake_tracebacks(char **aseq, int nseq, int alen, int *matassign,			    struct p7trace_s ***ret_tr);static void trace_doctor(struct p7trace_s *tr, int M, int *ret_ndi, 			 int *ret_nid);static void annotate_model(struct plan7_s *hmm, int *matassign, MSA *msa);static void print_matassign(int *matassign, int alen);/* Function: P7Handmodelmaker() *  * Purpose:  Manual model construction: *           Construct an HMM from an alignment, where the #=RF line *           of a HMMER alignment file is given to indicate *           the columns assigned to matches vs. inserts. *            *           NOTE: Handmodelmaker() will slightly revise the alignment *           if necessary, if the assignment of columns implies *           DI and ID transitions. *            *           Returns both the HMM in counts form (ready for applying *           Dirichlet priors as the next step), and fake tracebacks *           for each aligned sequence. *            * Args:     msa  - multiple sequence alignment           *           dsq  - digitized unaligned aseq's *           ret_hmm - RETURN: counts-form HMM *           ret_tr  - RETURN: array of tracebacks for aseq's *            * Return:   (void) *           ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr), *           FreeHMM(hmm). */            voidP7Handmodelmaker(MSA *msa, char **dsq,		 struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr){  int     *matassign;           /* MAT state assignments if 1; 1..alen */  int      apos;                /* counter for aligned columns         */  /* Make sure we have all the info about the alignment that we need */  if (msa->rf == NULL)    Die("Alignment must have RF annotation to hand-build an HMM");				/* Allocation */  matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1));    /* Determine match assignment from optional annotation   */  matassign[0] = 0;  for (apos = 0; apos < msa->alen; apos++)    {      matassign[apos+1] = 0;      if (!isgap(msa->rf[apos])) 	matassign[apos+1] |= ASSIGN_MATCH;      else 	matassign[apos+1] |= ASSIGN_INSERT;    }  /* Hand matassign off for remainder of model construction   */  /*   print_matassign(matassign, msa->alen); */  matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr);  free(matassign);  return;}/* Function: P7Fastmodelmaker() *  * Purpose:  Heuristic model construction: *           Construct an HMM from an alignment using the original *           Krogh/Haussler heuristic; any column with more  *           symbols in it than a given fraction is assigned to *           match. *            *           NOTE: Fastmodelmaker() will slightly revise the *           alignment if the assignment of columns implies *           DI and ID transitions. *            *           Returns the HMM in counts form (ready for applying Dirichlet *           priors as the next step). Also returns fake traceback *           for each training sequence. *            * Args:     msa     - multiple sequence alignment *           dsq     - digitized unaligned aseq's *           maxgap  - if more gaps than this, column becomes insert. *           ret_hmm - RETURN: counts-form HMM *           ret_tr  - RETURN: array of tracebacks for aseq's *            * Return:   (void) *           ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr), *           FreeHMM(hmm).        */voidP7Fastmodelmaker(MSA *msa, char **dsq, float maxgap,		 struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr){  int     *matassign;           /* MAT state assignments if 1; 1..alen */  int      idx;                 /* counter over sequences              */  int      apos;                /* counter for aligned columns         */  int      ngap;		/* number of gaps in a column          */  /* Allocations: matassign is 1..alen array of bit flags   */  matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1));    /* Determine match assignment by counting symbols in columns   */  matassign[0] = 0;  for (apos = 0; apos < msa->alen; apos++) {      matassign[apos+1] = 0;      ngap = 0;      for (idx = 0; idx < msa->nseq; idx++) 	if (isgap(msa->aseq[idx][apos])) 	  ngap++;            if ((float) ngap / (float) msa->nseq > maxgap)	matassign[apos+1] |= ASSIGN_INSERT;      else	matassign[apos+1] |= ASSIGN_MATCH;  }  /* Once we have matassign calculated, all modelmakers behave   * the same; matassign2hmm() does this stuff (traceback construction,   * trace counting) and sets up ret_hmm and ret_tr.   */  matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr);  free(matassign);  return;}  /* Function: P7Maxmodelmaker() *  * Purpose:  The Unholy Beast of HMM model construction algorithms -- *           maximum a posteriori construction. A tour de force and *           probably overkill. MAP construction for Krogh  *           HMM-profiles is fairly straightforward, but MAP construction of *           Plan 7 HMM-profiles is, er, intricate. *            *           Given a multiple alignment, construct an optimal (MAP) model *           architecture. Return a counts-based HMM. *            * Args:     msa     - multiple sequence alignment  *           dsq     - digitized, unaligned seqs *           maxgap  - above this, trailing columns are assigned to C            *           prior   - priors on parameters to use for model construction *           null    - random sequence model emissions *           null_p1 - random sequence model p1 transition  *           mpri    - prior on architecture: probability of new match node *           ret_hmm - RETURN: new hmm (counts form) *           ret_tr  - RETURN: array of tracebacks for aseq's * * Return:   (void) *           ret_hmm and ret_tr (if !NULL) must be free'd by the caller. */          voidP7Maxmodelmaker(MSA *msa, char **dsq, float maxgap,		struct p7prior_s *prior, 		float *null, float null_p1, float mpri, 		struct plan7_s **ret_hmm, struct p7trace_s  ***ret_tr){  int     idx;			/* counter for seqs                */  int     i, j;			/* positions in alignment          */  int     x;			/* counter for syms or transitions */  float **matc;                 /* count vectors: [1..alen][0..19] */   float   cij[8], tij[8];	/* count and score transit vectors */  float   matp[MAXABET];        /* match emission vector           */  float   insp[MAXABET];	/* insert score vector             */  float   insc[MAXABET];	/* insert count vector             */  float  *sc;                   /* DP scores [0,1..alen,alen+1]    */  int    *tbck;                 /* traceback ptrs for sc           */  int    *matassign;            /* match assignments [1..alen]     */   int    *insopt;		/* number of inserted chars [0..nseq-1] */  int     first, last;		/* positions of first and last cols [1..alen] */  float   bm1, bm2;		/* estimates for start,internal b->m t's  */  int     est_M;		/* estimate for the size of the model */  float   t_me;			/* estimate for an internal M->E transition */  float   new, bestsc;		/* new score, best score so far     */  int     code;			/* optimization: return code from build_cij() */  int     ngap;			/* gap count in a column                      */  float   wgtsum;		/* sum of weights; do not assume it is nseq */  /* Allocations   */  matc      = (float **) MallocOrDie (sizeof(float *) * (msa->alen+1));  sc        = (float *)  MallocOrDie (sizeof(float)   * (msa->alen+2));  tbck      = (int *)    MallocOrDie (sizeof(int)     * (msa->alen+2));  matassign = (int *)    MallocOrDie (sizeof(int)     * (msa->alen+1));  insopt    = (int *)    MallocOrDie (sizeof(int)     * msa->nseq);      for (i = 0; i < msa->alen; i++) {    matc[i+1] = (float *) MallocOrDie (Alphabet_size * sizeof(float));    FSet(matc[i+1], Alphabet_size, 0.);  }  /* Precalculations   */  for (i = 0; i < msa->alen; i++)     for (idx = 0; idx < msa->nseq; idx++)       if (!isgap(msa->aseq[idx][i])) 	P7CountSymbol(matc[i+1], SymbolIndex(msa->aseq[idx][i]), msa->wgt[idx]);  mpri = sreLOG2(mpri);    FCopy(insp, prior->i[0], Alphabet_size);  FNorm(insp, Alphabet_size);  wgtsum = FSum(msa->wgt, msa->nseq);  for (x = 0; x < Alphabet_size; x++)    insp[x] = sreLOG2(insp[x] / null[x]);    /* Estimate the relevant special transitions.   */  est_M = estimate_model_length(msa);  t_me  = 0.5 / (float) (est_M-1);  bm1   = 0.5;  bm2   = 0.5 / (float) (est_M-1);  bm1   = sreLOG2(bm1 / null_p1);  bm2   = sreLOG2(bm2 / null_p1);  /* Estimate the position of the last match-assigned column   * by counting gap frequencies.   */   maxgap = 0.5;  for (last = msa->alen; last >= 1; last--) {    ngap = 0;    for (idx = 0; idx < msa->nseq; idx++)      if (isgap(msa->aseq[idx][last-1])) ngap++;    if ((float) ngap / (float) msa->nseq <= maxgap)      break;  }  /* Initialization   */  sc[last]   = 0.;  tbck[last] = 0;  /* Set ME gaps to '_'   */  for (idx = 0; idx < msa->nseq; idx++)     for (i = last; i > 0 && isgap(msa->aseq[idx][i-1]); i--)      msa->aseq[idx][i-1] = '_';  /* Main recursion moves from right to left.   */  for (i = last-1; i > 0; i--) {				/* Calculate match emission scores for i  */    FCopy(matp, matc[i], Alphabet_size);    P7PriorifyEmissionVector(matp, prior, prior->mnum, prior->mq, prior->m, NULL);     for (x = 0; x < Alphabet_size; x++)      matp[x] = sreLOG2(matp[x] / null[x]);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -