📄 modelmakers.c
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. *****************************************************************//* modelmakers.c * SRE, Fri Nov 15 10:00:04 1996 * * Construction of models from multiple alignments. Three versions: * Handmodelmaker() -- use #=RF annotation to indicate match columns * Fastmodelmaker() -- Krogh/Haussler heuristic * Maxmodelmaker() -- MAP model construction algorithm (Eddy, * unpublished) * * The meat of the model construction code is in matassign2hmm(). * The three model construction strategies simply label which columns * are supposed to be match states, and then hand this info to * matassign2hmm(). * * Two wrinkles to watch for: * 1) The alignment is assumed to contain sequence fragments. Look in * fake_tracebacks() for how internal entry/exit points are handled. * 2) Plan7 disallows DI and ID transitions, but an alignment may * imply these. Look in trace_doctor() for how DI and ID transitions * are removed. */#include <stdio.h>#include <string.h>#include <limits.h>#include <math.h>#include <float.h>#include <ctype.h>#include "structs.h"#include "config.h"#include "funcs.h"#include "squid.h"#include "msa.h"/* flags used for matassign[] arrays -- * assignment of aligned columns to match/insert states */#define ASSIGN_MATCH (1<<0) #define FIRST_MATCH (1<<1)#define LAST_MATCH (1<<2)#define ASSIGN_INSERT (1<<3)#define EXTERNAL_INSERT_N (1<<4)#define EXTERNAL_INSERT_C (1<<5) static int build_cij(char **aseqs, int nseq, int *insopt, int i, int j, float *wgt, float *cij);static int estimate_model_length(MSA *msa);static void matassign2hmm(MSA *msa, char **dsq, int *matassign, struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr);static void fake_tracebacks(char **aseq, int nseq, int alen, int *matassign, struct p7trace_s ***ret_tr);static void trace_doctor(struct p7trace_s *tr, int M, int *ret_ndi, int *ret_nid);static void annotate_model(struct plan7_s *hmm, int *matassign, MSA *msa);static void print_matassign(int *matassign, int alen);/* Function: P7Handmodelmaker() * * Purpose: Manual model construction: * Construct an HMM from an alignment, where the #=RF line * of a HMMER alignment file is given to indicate * the columns assigned to matches vs. inserts. * * NOTE: Handmodelmaker() will slightly revise the alignment * if necessary, if the assignment of columns implies * DI and ID transitions. * * Returns both the HMM in counts form (ready for applying * Dirichlet priors as the next step), and fake tracebacks * for each aligned sequence. * * Args: msa - multiple sequence alignment * dsq - digitized unaligned aseq's * ret_hmm - RETURN: counts-form HMM * ret_tr - RETURN: array of tracebacks for aseq's * * Return: (void) * ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr), * FreeHMM(hmm). */ voidP7Handmodelmaker(MSA *msa, char **dsq, struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr){ int *matassign; /* MAT state assignments if 1; 1..alen */ int apos; /* counter for aligned columns */ /* Make sure we have all the info about the alignment that we need */ if (msa->rf == NULL) Die("Alignment must have RF annotation to hand-build an HMM"); /* Allocation */ matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1)); /* Determine match assignment from optional annotation */ matassign[0] = 0; for (apos = 0; apos < msa->alen; apos++) { matassign[apos+1] = 0; if (!isgap(msa->rf[apos])) matassign[apos+1] |= ASSIGN_MATCH; else matassign[apos+1] |= ASSIGN_INSERT; } /* Hand matassign off for remainder of model construction */ /* print_matassign(matassign, msa->alen); */ matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr); free(matassign); return;}/* Function: P7Fastmodelmaker() * * Purpose: Heuristic model construction: * Construct an HMM from an alignment using the original * Krogh/Haussler heuristic; any column with more * symbols in it than a given fraction is assigned to * match. * * NOTE: Fastmodelmaker() will slightly revise the * alignment if the assignment of columns implies * DI and ID transitions. * * Returns the HMM in counts form (ready for applying Dirichlet * priors as the next step). Also returns fake traceback * for each training sequence. * * Args: msa - multiple sequence alignment * dsq - digitized unaligned aseq's * maxgap - if more gaps than this, column becomes insert. * ret_hmm - RETURN: counts-form HMM * ret_tr - RETURN: array of tracebacks for aseq's * * Return: (void) * ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr), * FreeHMM(hmm). */voidP7Fastmodelmaker(MSA *msa, char **dsq, float maxgap, struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr){ int *matassign; /* MAT state assignments if 1; 1..alen */ int idx; /* counter over sequences */ int apos; /* counter for aligned columns */ int ngap; /* number of gaps in a column */ /* Allocations: matassign is 1..alen array of bit flags */ matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1)); /* Determine match assignment by counting symbols in columns */ matassign[0] = 0; for (apos = 0; apos < msa->alen; apos++) { matassign[apos+1] = 0; ngap = 0; for (idx = 0; idx < msa->nseq; idx++) if (isgap(msa->aseq[idx][apos])) ngap++; if ((float) ngap / (float) msa->nseq > maxgap) matassign[apos+1] |= ASSIGN_INSERT; else matassign[apos+1] |= ASSIGN_MATCH; } /* Once we have matassign calculated, all modelmakers behave * the same; matassign2hmm() does this stuff (traceback construction, * trace counting) and sets up ret_hmm and ret_tr. */ matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr); free(matassign); return;} /* Function: P7Maxmodelmaker() * * Purpose: The Unholy Beast of HMM model construction algorithms -- * maximum a posteriori construction. A tour de force and * probably overkill. MAP construction for Krogh * HMM-profiles is fairly straightforward, but MAP construction of * Plan 7 HMM-profiles is, er, intricate. * * Given a multiple alignment, construct an optimal (MAP) model * architecture. Return a counts-based HMM. * * Args: msa - multiple sequence alignment * dsq - digitized, unaligned seqs * maxgap - above this, trailing columns are assigned to C * prior - priors on parameters to use for model construction * null - random sequence model emissions * null_p1 - random sequence model p1 transition * mpri - prior on architecture: probability of new match node * ret_hmm - RETURN: new hmm (counts form) * ret_tr - RETURN: array of tracebacks for aseq's * * Return: (void) * ret_hmm and ret_tr (if !NULL) must be free'd by the caller. */ voidP7Maxmodelmaker(MSA *msa, char **dsq, float maxgap, struct p7prior_s *prior, float *null, float null_p1, float mpri, struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr){ int idx; /* counter for seqs */ int i, j; /* positions in alignment */ int x; /* counter for syms or transitions */ float **matc; /* count vectors: [1..alen][0..19] */ float cij[8], tij[8]; /* count and score transit vectors */ float matp[MAXABET]; /* match emission vector */ float insp[MAXABET]; /* insert score vector */ float insc[MAXABET]; /* insert count vector */ float *sc; /* DP scores [0,1..alen,alen+1] */ int *tbck; /* traceback ptrs for sc */ int *matassign; /* match assignments [1..alen] */ int *insopt; /* number of inserted chars [0..nseq-1] */ int first, last; /* positions of first and last cols [1..alen] */ float bm1, bm2; /* estimates for start,internal b->m t's */ int est_M; /* estimate for the size of the model */ float t_me; /* estimate for an internal M->E transition */ float new, bestsc; /* new score, best score so far */ int code; /* optimization: return code from build_cij() */ int ngap; /* gap count in a column */ float wgtsum; /* sum of weights; do not assume it is nseq */ /* Allocations */ matc = (float **) MallocOrDie (sizeof(float *) * (msa->alen+1)); sc = (float *) MallocOrDie (sizeof(float) * (msa->alen+2)); tbck = (int *) MallocOrDie (sizeof(int) * (msa->alen+2)); matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1)); insopt = (int *) MallocOrDie (sizeof(int) * msa->nseq); for (i = 0; i < msa->alen; i++) { matc[i+1] = (float *) MallocOrDie (Alphabet_size * sizeof(float)); FSet(matc[i+1], Alphabet_size, 0.); } /* Precalculations */ for (i = 0; i < msa->alen; i++) for (idx = 0; idx < msa->nseq; idx++) if (!isgap(msa->aseq[idx][i])) P7CountSymbol(matc[i+1], SymbolIndex(msa->aseq[idx][i]), msa->wgt[idx]); mpri = sreLOG2(mpri); FCopy(insp, prior->i[0], Alphabet_size); FNorm(insp, Alphabet_size); wgtsum = FSum(msa->wgt, msa->nseq); for (x = 0; x < Alphabet_size; x++) insp[x] = sreLOG2(insp[x] / null[x]); /* Estimate the relevant special transitions. */ est_M = estimate_model_length(msa); t_me = 0.5 / (float) (est_M-1); bm1 = 0.5; bm2 = 0.5 / (float) (est_M-1); bm1 = sreLOG2(bm1 / null_p1); bm2 = sreLOG2(bm2 / null_p1); /* Estimate the position of the last match-assigned column * by counting gap frequencies. */ maxgap = 0.5; for (last = msa->alen; last >= 1; last--) { ngap = 0; for (idx = 0; idx < msa->nseq; idx++) if (isgap(msa->aseq[idx][last-1])) ngap++; if ((float) ngap / (float) msa->nseq <= maxgap) break; } /* Initialization */ sc[last] = 0.; tbck[last] = 0; /* Set ME gaps to '_' */ for (idx = 0; idx < msa->nseq; idx++) for (i = last; i > 0 && isgap(msa->aseq[idx][i-1]); i--) msa->aseq[idx][i-1] = '_'; /* Main recursion moves from right to left. */ for (i = last-1; i > 0; i--) { /* Calculate match emission scores for i */ FCopy(matp, matc[i], Alphabet_size); P7PriorifyEmissionVector(matp, prior, prior->mnum, prior->mq, prior->m, NULL); for (x = 0; x < Alphabet_size; x++) matp[x] = sreLOG2(matp[x] / null[x]);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -