📄 structs.h
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* structs.h * * Data structures used in HMMER. * Also, a few miscellaneous macros and global variable declarations. * * RCS $Id: structs.h,v 1.22 2001/06/07 17:38:48 eddy Exp $ */#ifndef STRUCTSH_INCLUDED#define STRUCTSH_INCLUDED#include "squid.h"#include "config.h"#include "ssi.h"/* Miscellaneous math macros used in the package */#define sreLOG2(x) ((x) > 0 ? log(x) * 1.44269504 : -9999.)#define sreEXP2(x) (exp((x) * 0.69314718 )) #define SQR(x) ((x) * (x))/* an idiom for determining a symbol's position in the array * by pointer arithmetic. * does no error checking, so caller must already be damned sure x is * valid in the alphabet! */#define SYMIDX(x) (strchr(Alphabet, (x)) - Alphabet)/* The symbol alphabet. * Must deal with IUPAC degeneracies. Nondegenerate symbols * come first in Alphabet[], followed by degenerate symbols. * Nucleic alphabet also must deal with other common symbols * like U (in RNA) and X (often misused for N). * Example: * Nucleic: "ACGTUNRYMKSWHBVDX" size=4 iupac=17 * Amino: "ACDEFGHIKLMNPQRSTVWYBZX" size=20 iupac=23 * * Parts of the code assume that the last symbol is a * symbol for an unknown residue, i.e. 'X'. * * MAXCODE and MAXABET constants are defined in config.h */ extern char Alphabet[MAXCODE]; /* "ACDEFGHIKLMNPQRSTVWYBZX" for example */extern int Alphabet_type; /* hmmNUCLEIC or hmmAMINO */extern int Alphabet_size; /* uniq alphabet size: 4 or 20 */extern int Alphabet_iupac; /* total size of alphabet + IUPAC degen. */extern char Degenerate[MAXCODE][MAXABET];extern int DegenCount[MAXCODE];#define hmmNOTSETYET 0#define hmmNUCLEIC 2 /* compatibility with squid's kRNA */#define hmmAMINO 3 /* compatibility with squid's kAmino *//********************************************************************** * * Plan7 * Implementation of the new Plan7 HMM architecture. * Fully probabilistic even for hmmsw, hmmls, and hmmfs; * No insert->delete or delete->insert transitions; * Improved structure layout. * * The strategy is to infiltrate plan7 code into HMMER in * an evolutionary rather than revolutionary manner. * **********************************************************************//* Plan 7 construction strategies. */enum p7_construction { P7_MAP_CONSTRUCTION, /* maximum a posteriori architecture */ P7_HAND_CONSTRUCTION, /* hand specified architecture */ P7_FAST_CONSTRUCTION /* fast ad hoc architecture */};/* Plan 7 parameter optimization strategies */enum p7_param { P7_MAP_PARAM, /* standard maximum a posteriori */ P7_MD_PARAM, /* maximum discrimination */ P7_MRE_PARAM, /* maximum relative entropy */ P7_WMAP_PARAM /* ad hoc weighted MAP */};/* Structure: plan7_s * * Declaration of a Plan 7 profile-HMM. */struct plan7_s { /* Annotation on the model. A name is mandatory. * Other fields are optional; whether they are present is * flagged in the stateflags bit array. * * desc is only valid if PLAN7_DESC is set in flags. * acc is only valid if PLAN7_ACC is set in flags. * rf is only valid if PLAN7_RF is set in flags. * cs is only valid if PLAN7_CS is set in flags. * ca is only valid if PLAN7_CA is set in flags. * map is only valid if PLAN7_MAP is set in flags. */ char *name; /* name of the model +*/ char *acc; /* accession number of model (Pfam) +*/ char *desc; /* brief description of model +*/ char *rf; /* reference line from alignment 0..M +*/ char *cs; /* consensus structure line 0..M +*/ char *ca; /* consensus accessibility line 0..M */ char *comlog; /* command line(s) that built model +*/ int nseq; /* number of training sequences +*/ char *ctime; /* creation date +*/ int *map; /* map of alignment cols onto model 1..M+*/ int checksum; /* checksum of training sequences +*/ /* The following are annotations added to support work by Michael Asman, * CGR Stockholm. They are not stored in model files; they are only * used in model construction. * * #=GC X-PRM (PRT,PRI) annotation is picked up by hmmbuild and interpreted * as specifying which mixture Dirichlet component to use. If these flags * are non-NULL, the normal mixture Dirichlet code is bypassed, and a * single specific Dirichlet is used at each position. */ int *tpri; /* which transition mixture prior to use */ int *mpri; /* which match mixture prior to use */ int *ipri; /* which insert mixture prior to use */ /* Pfam-specific score cutoffs. * * ga1, ga2 are valid if PLAN7_GA is set in flags. * tc1, tc2 are valid if PLAN7_TC is set in flags. * nc1, nc2 are valid if PLAN7_NC is set in flags. */ float ga1, ga2; /* per-seq/per-domain gathering thresholds (bits) +*/ float tc1, tc2; /* per-seq/per-domain trusted cutoff (bits) +*/ float nc1, nc2; /* per-seq/per-domain noise cutoff (bits) +*/ /* The main model in probability form: data-dependent probabilities. * This is the core Krogh/Haussler model. * Transition probabilities are usually accessed as a * two-D array: hmm->t[k][TMM], for instance. They are allocated * such that they can also be stepped through in 1D by pointer * manipulations, for efficiency in DP algorithms. */ int M; /* length of the model (# nodes) +*/ float **t; /* transition prob's. t[1..M-1][0..6] +*/ float **mat; /* match emissions. mat[1..M][0..19] +*/ float **ins; /* insert emissions. ins[1..M-1][0..19] +*/ float tbd1; /* B->D1 prob (data dependent) +*/ /* The unique states of Plan 7 in probability form. * These are the algorithm-dependent, data-independent probabilities. * Some parts of the code may briefly use a trick of copying tbd1 * into begin[0]; this makes it easy to call FChoose() or FNorm() * on the resulting vector. However, in general begin[0] is not * a valid number. */ float xt[4][2]; /* N,E,C,J extra states: 2 transitions +*/ float *begin; /* 1..M B->M state transitions +*/ float *end; /* 1..M M->E state transitions (!= a dist!) +*/ /* The null model probabilities. */ float null[MAXABET]; /* "random sequence" emission prob's +*/ float p1; /* null model loop probability +*/ /* The model in log-odds score form. * These are created from the probabilities by LogoddsifyHMM(). * By definition, null[] emission scores are all zero. * Note that emission distributions are over 26 upper-case letters, * not just the unambiguous protein or DNA alphabet: we * precalculate the scores for all IUPAC degenerate symbols we * may see. Non-IUPAC symbols simply have a -INFTY score. * Note the reversed indexing on msc and isc -- for efficiency reasons. * * Only valid if PLAN7_HASBITS is set. */ int **tsc; /* transition scores [1.M-1][0.6] -*/ int **msc; /* match emission scores [0.MAXCODE-1][1.M] -*/ int **isc; /* ins emission scores [0.MAXCODE-1][1.M-1] -*/ int xsc[4][2]; /* N,E,C,J transitions -*/ int *bsc; /* begin transitions [1.M] -*/ int *esc; /* end transitions [1.M] -*/ /* DNA translation scoring parameters * For aligning protein Plan7 models to DNA sequence. * Lookup value for a codon is calculated by pos1 * 16 + pos2 * 4 + pos3, * where 'pos1' is the digitized value of the first nucleotide position; * if any of the positions are ambiguous codes, lookup value 64 is used * (which will generally have a score of zero) * * Only valid if PLAN7_HASDNA is set. */ int **dnam; /* triplet match scores [0.64][1.M] -*/ int **dnai; /* triplet insert scores [0.64][1.M] -*/ int dna2; /* -1 frameshift, doublet emission, M or I -*/ int dna4; /* +1 frameshift, doublet emission, M or I -*/ /* P-value and E-value statistical parameters * Only valid if PLAN7_STATS is set. */ float mu; /* EVD mu +*/ float lambda; /* EVD lambda +*/ int flags; /* bit flags indicating state of HMM, valid data +*/};/* Flags for plan7->flags. * Note: Some models have scores but no probabilities (for instance, * after reading from an HMM save file). Other models have * probabilities but no scores (for instance, during training * or building). Since it costs time to convert either way, * I use PLAN7_HASBITS and PLAN7_HASPROB flags to defer conversion * until absolutely necessary. This means I have to be careful * about keeping these flags set properly when I fiddle a model. */#define PLAN7_HASBITS (1<<0) /* raised if model has log-odds scores */#define PLAN7_DESC (1<<1) /* raised if description exists */#define PLAN7_RF (1<<2) /* raised if #RF annotation available */#define PLAN7_CS (1<<3) /* raised if #CS annotation available */#define PLAN7_XRAY (1<<4) /* raised if structural data available */#define PLAN7_HASPROB (1<<5) /* raised if model has probabilities */#define PLAN7_HASDNA (1<<6) /* raised if protein HMM->DNA seq params set*/#define PLAN7_STATS (1<<7) /* raised if EVD parameters are available */#define PLAN7_MAP (1<<8) /* raised if alignment map is available */#define PLAN7_ACC (1<<9) /* raised if accession number is available */#define PLAN7_GA (1<<10) /* raised if gathering thresholds available */#define PLAN7_TC (1<<11) /* raised if trusted cutoffs available */#define PLAN7_NC (1<<12) /* raised if noise cutoffs available */#define PLAN7_CA (1<<13) /* raised if surface accessibility avail. *//* Indices for special state types, I: used for dynamic programming xmx[][] * mnemonic: eXtra Matrix for B state = XMB */#define XMB 0#define XME 1#define XMC 2#define XMJ 3#define XMN 4/* Indices for special state types, II: used for hmm->xt[] indexing * mnemonic: eXtra Transition for N state = XTN */#define XTN 0#define XTE 1#define XTC 2#define XTJ 3/* Indices for Plan7 main model state transitions. * Used for indexing hmm->t[k][] * mnemonic: Transition from Match to Match = TMM */#define TMM 0#define TMI 1#define TMD 2#define TIM 3#define TII 4#define TDM 5#define TDD 6 /* Indices for extra state transitions * Used for indexing hmm->xt[][]. */#define MOVE 0 /* trNB, trEC, trCT, trJB */#define LOOP 1 /* trNN, trEJ, trCC, trJJ *//* Declaration of Plan7 dynamic programming matrix structure. */struct dpmatrix_s { int **xmx; /* special scores [0.1..N][BECJN] */ int **mmx; /* match scores [0.1..N][0.1..M] */ int **imx; /* insert scores [0.1..N][0.1..M-1.M] */ int **dmx; /* delete scores [0.1..N][0.1..M-1.M] */};/* Declaration of Plan7 shadow matrix structure. * In general, allowed values are STM, STI, etc.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -