📄 prior.c
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. *****************************************************************//* prior.c * SRE, Mon Nov 18 15:44:08 1996 * * Support for Dirichlet prior data structure, p7prior_s. */#include "config.h"#include "structs.h"#include "funcs.h" #include "squid.h"#ifdef MEMDEBUG#include "dbmalloc.h"#endifstatic struct p7prior_s *default_amino_prior(void);static struct p7prior_s *default_nucleic_prior(void);/* Function: P7AllocPrior(), P7FreePrior() * * Purpose: Allocation and free'ing of a prior structure. * Very simple, but might get more complex someday. */struct p7prior_s *P7AllocPrior(void){ return (struct p7prior_s *) MallocOrDie (sizeof(struct p7prior_s)); }voidP7FreePrior(struct p7prior_s *pri){ free(pri); }/* Function: P7LaplacePrior() * * Purpose: Create a Laplace plus-one prior. (single component Dirichlets). * Global alphabet info is assumed to have been set already. * * Args: (void) * * Return: prior. Allocated here; call FreePrior() to free it. */ struct p7prior_s *P7LaplacePrior(void){ struct p7prior_s *pri; pri = P7AllocPrior(); pri->strategy = PRI_DCHLET; pri->tnum = 1; pri->tq[0] = 1.; FSet(pri->t[0], 8, 1.); pri->mnum = 1; pri->mq[0] = 1.; FSet(pri->m[0], Alphabet_size, 1.); pri->inum = 1; pri->iq[0] = 1.; FSet(pri->i[0], Alphabet_size, 1.); return pri;}/* Function: P7DefaultPrior() * * Purpose: Set up a somewhat more realistic single component * Dirichlet prior than Laplace. */ struct p7prior_s *P7DefaultPrior(void){ switch (Alphabet_type) { case hmmAMINO: return default_amino_prior(); case hmmNUCLEIC: return default_nucleic_prior(); case hmmNOTSETYET: Die("Can't set prior; alphabet type not set yet"); } /*NOTREACHED*/ return NULL;}/* Function: P7ReadPrior() * * Purpose: Input a prior from disk file. */struct p7prior_s *P7ReadPrior(char *prifile) { FILE *fp; struct p7prior_s *pri; char *sptr; int q, x; if ((fp = fopen(prifile, "r")) == NULL) Die("Failed to open HMMER prior file %s\n", prifile); pri = P7AllocPrior(); /* First entry is the strategy: * Only standard Dirichlet prior (simple or mixture) is supported in Plan7 so far */ sptr = Getword(fp, sqdARG_STRING); s2upper(sptr); if (strcmp(sptr, "DIRICHLET") == 0) pri->strategy = PRI_DCHLET; else Die("No such prior strategy %s; failed to parse file %s", sptr, prifile); /* Second entry is the alphabet type: * Amino or Nucleic */ sptr = Getword(fp, sqdARG_STRING); s2upper(sptr); if (strcmp(sptr, "AMINO") == 0) { if (Alphabet_type != hmmAMINO) Die("HMM and/or sequences are DNA/RNA; can't use protein prior %s", prifile); } else if (strcmp(sptr, "NUCLEIC") == 0) { if (Alphabet_type != hmmNUCLEIC) Die("HMM and/or sequences are protein; can't use DNA/RNA prior %s", prifile); } else Die("Alphabet \"%s\" in prior file %s isn't valid.", sptr, prifile); /* State transition priors: * # of mixtures. * then for each mixture: * prior P(q) * Dirichlet terms for Tmm, Tmi, Tmd, Tim, Tii, Tid, Tdm, Tdi, Tdd */ pri->tnum = atoi(Getword(fp, sqdARG_INT)); if (pri->tnum < 0) Die("%d is bad; need at least one state transition mixture component", pri->tnum); if (pri->tnum > MAXDCHLET) Die("%d is bad, too many transition components (MAXDCHLET = %d)\n", MAXDCHLET); for (q = 0; q < pri->tnum; q++) { pri->tq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); for (x = 0; x < 7; x++) pri->t[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); } /* Match emission priors: * # of mixtures. * then for each mixture: * prior P(q) * Dirichlet terms for Alphabet_size symbols in Alphabet */ pri->mnum = atoi(Getword(fp, sqdARG_INT)); if (pri->mnum < 0) Die("%d is bad; need at least one match emission mixture component", pri->mnum); if (pri->mnum > MAXDCHLET) Die("%d is bad; too many match components (MAXDCHLET = %d)\n", pri->mnum, MAXDCHLET); for (q = 0; q < pri->mnum; q++) { pri->mq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); for (x = 0; x < Alphabet_size; x++) pri->m[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); } /* Insert emission priors: * # of mixtures. * then for each mixture component: * prior P(q) * Dirichlet terms for Alphabet_size symbols in Alphabet */ pri->inum = atoi(Getword(fp, sqdARG_INT)); if (pri->inum < 0) Die("%d is bad; need at least one insert emission mixture component", pri->inum); if (pri->inum > MAXDCHLET) Die("%d is bad; too many insert components (MAXDCHLET = %d)\n", pri->inum, MAXDCHLET); for (q = 0; q < pri->inum; q++) { pri->iq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); for (x = 0; x < Alphabet_size; x++) pri->i[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); } fclose(fp); return pri;}/* Function: PAMPrior() * * Purpose: Produces an ad hoc "Dirichlet mixture" prior for * match emissions, using a PAM matrix. * * Side effect notice: PAMPrior() replaces the match * emission section of an existing Dirichlet prior, * which is /expected/ to be a simple one-component * kind of prior. The insert emissions /must/ be a * one-component prior (because of details in how * PriorifyEmissionVector() is done). However, * the transitions /could/ be a mixture Dirichlet prior * without causing problems. In other words, the * -p and -P options of hmmb can coexist, but there * may be conflicts. PAMPrior() checks for these, * so there's no serious problem, except that the * error message from PAMPrior() might be confusing to * a user. */voidPAMPrior(char *pamfile, struct p7prior_s *pri, float wt){ FILE *fp; char *blastpamfile; /* BLAST looks in aa/ subdirectory of BLASTMAT */ int **pam; float scale; int xi, xj; int idx1, idx2; if (Alphabet_type != hmmAMINO) Die("PAM prior is only valid for protein sequences"); if (pri->strategy != PRI_DCHLET) Die("PAM prior may only be applied over an existing Dirichlet prior"); if (pri->inum != 1) Die("PAM prior requires that the insert emissions be a single Dirichlet"); if (MAXDCHLET < 20) Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior"); blastpamfile = FileConcat("aa", pamfile); if ((fp = fopen(pamfile, "r")) == NULL && (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL && (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL) Die("Failed to open PAM scoring matrix file %s", pamfile); if (! ParsePAMFile(fp, &pam, &scale)) Die("Failed to parse PAM scoring matrix file %s", pamfile); fclose(fp); free(blastpamfile); pri->strategy = PRI_PAM; pri->mnum = 20; /* Convert PAM entries back to conditional prob's P(xj | xi), * which we'll use as "pseudocounts" weighted by wt. */ for (xi = 0; xi < Alphabet_size; xi++) for (xj = 0; xj < Alphabet_size; xj++) { idx1 = Alphabet[xi] - 'A'; idx2 = Alphabet[xj] - 'A'; pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale); } /* Normalize so that rows add up to wt. * i.e. Sum(xj) mat[xi][xj] = wt for every row xi */ for (xi = 0; xi < Alphabet_size; xi++) { pri->mq[xi] = 1. / Alphabet_size; FNorm(pri->m[xi], Alphabet_size); FScale(pri->m[xi], Alphabet_size, wt); } Free2DArray((void **)pam,27);}/* Function: P7DefaultNullModel() * * Purpose: Set up a default random sequence model, using * global aafq[]'s for protein or 1/Alphabet_size for anything * else. randomseq is alloc'ed in caller. Alphabet information * must already be known. */voidP7DefaultNullModel(float *null, float *ret_p1){ int x; if (Alphabet_type == hmmAMINO) { for (x = 0; x < Alphabet_size; x++) null[x] = aafq[x]; *ret_p1 = 350./351.; /* rationale: approx avg protein length. */ } else { for (x = 0; x < Alphabet_size; x++) null[x] = 1.0 / (float) Alphabet_size; *ret_p1 = 1000./1001.; /* rationale: approx inter-Alu distance. */ }}voidP7ReadNullModel(char *rndfile, float *null, float *ret_p1){ FILE *fp; char *s; int x; int type = 0; if ((fp = fopen(rndfile, "r")) == NULL) Die("Failed to open null model file %s\n", rndfile); if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; s2upper(s); if (strcmp(s, "NUCLEIC") == 0) type = hmmNUCLEIC; else if (strcmp(s, "AMINO") == 0) type = hmmAMINO; else goto FAILURE; /* check/set alphabet type */ if (Alphabet_type == 0) SetAlphabet(type); else if (Alphabet_type != type) Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile); /* parse the file */ for (x = 0; x < Alphabet_size; x++) { if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; null[x] = atof(s); } if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; *ret_p1 = atof(s); fclose(fp); return;FAILURE: fclose(fp); Die("%s is not in HMMER null model file format", rndfile);}/* Function: P7PriorifyHMM() * * Purpose: Add pseudocounts to an HMM using Dirichlet priors, * and renormalize the HMM. * * Args: hmm -- the HMM to add counts to (counts form) * pri -- the Dirichlet prior to use * * Return: (void) * HMM returns in probability form. */ voidP7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri){ int k; /* counter for model position */ float d; /* a denominator */ float tq[MAXDCHLET]; /* prior distribution over mixtures */ float mq[MAXDCHLET]; /* prior distribution over mixtures */ float iq[MAXDCHLET]; /* prior distribution over mixtures */ /* Model-dependent transitions are handled simply; Laplace. */ FSet(hmm->begin+2, hmm->M-1, 0.); /* wipe internal BM entries */ FSet(hmm->end+1, hmm->M-1, 0.); /* wipe internal ME exits */ d = hmm->tbd1 + hmm->begin[1] + 2.; hmm->tbd1 = (hmm->tbd1 + 1.)/ d; hmm->begin[1] = (hmm->begin[1] + 1.)/ d; hmm->end[hmm->M] = 1.0; /* Main model transitions and emissions */ for (k = 1; k < hmm->M; k++) { /* The following code chunk is experimental. * Collaboration with Michael Asman, Erik Sonnhammer, CGR Stockholm.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -