📄 prior.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* prior.c * SRE, Mon Nov 18 15:44:08 1996 *  * Support for Dirichlet prior data structure, p7prior_s. */#include "config.h"#include "structs.h"#include "funcs.h" #include "squid.h"#ifdef MEMDEBUG#include "dbmalloc.h"#endifstatic struct p7prior_s *default_amino_prior(void);static struct p7prior_s *default_nucleic_prior(void);/* Function: P7AllocPrior(), P7FreePrior() *  * Purpose:  Allocation and free'ing of a prior structure. *           Very simple, but might get more complex someday. */struct p7prior_s *P7AllocPrior(void){ return (struct p7prior_s *) MallocOrDie (sizeof(struct p7prior_s)); }voidP7FreePrior(struct p7prior_s *pri){ free(pri); }/* Function: P7LaplacePrior() *  * Purpose:  Create a Laplace plus-one prior. (single component Dirichlets).  *           Global alphabet info is assumed to have been set already. * * Args:     (void) * * Return:   prior. Allocated here; call FreePrior() to free it. */ struct p7prior_s *P7LaplacePrior(void){  struct p7prior_s *pri;    pri = P7AllocPrior();  pri->strategy = PRI_DCHLET;  pri->tnum     = 1;  pri->tq[0]    = 1.;  FSet(pri->t[0], 8, 1.);     pri->mnum  = 1;  pri->mq[0] = 1.;  FSet(pri->m[0], Alphabet_size, 1.);  pri->inum  = 1;  pri->iq[0] = 1.;  FSet(pri->i[0], Alphabet_size, 1.);  return pri;}/* Function: P7DefaultPrior() *  * Purpose:  Set up a somewhat more realistic single component *           Dirichlet prior than Laplace. */ struct p7prior_s *P7DefaultPrior(void){  switch (Alphabet_type) {  case hmmAMINO:     return default_amino_prior();  case hmmNUCLEIC:   return default_nucleic_prior();  case hmmNOTSETYET: Die("Can't set prior; alphabet type not set yet");  }  /*NOTREACHED*/  return NULL;}/* Function: P7ReadPrior() *  * Purpose:  Input a prior from disk file. */struct p7prior_s *P7ReadPrior(char *prifile) {  FILE             *fp;  struct p7prior_s *pri;  char             *sptr;  int               q, x;  if ((fp = fopen(prifile, "r")) == NULL)    Die("Failed to open HMMER prior file %s\n", prifile);  pri = P7AllocPrior();  /* First entry is the strategy:    * Only standard Dirichlet prior (simple or mixture) is supported in Plan7 so far   */  sptr = Getword(fp, sqdARG_STRING);  s2upper(sptr);  if      (strcmp(sptr, "DIRICHLET") == 0) pri->strategy = PRI_DCHLET;  else Die("No such prior strategy %s; failed to parse file %s", sptr, prifile);  /* Second entry is the alphabet type:   * Amino or Nucleic   */  sptr = Getword(fp, sqdARG_STRING);  s2upper(sptr);  if (strcmp(sptr, "AMINO") == 0)    {       if (Alphabet_type != hmmAMINO)	Die("HMM and/or sequences are DNA/RNA; can't use protein prior %s", prifile);    }  else if (strcmp(sptr, "NUCLEIC") == 0)    {      if (Alphabet_type != hmmNUCLEIC)	Die("HMM and/or sequences are protein; can't use DNA/RNA prior %s", prifile);    }  else     Die("Alphabet \"%s\" in prior file %s isn't valid.", sptr, prifile);  /* State transition priors:   * # of mixtures.   * then for each mixture:   *    prior P(q)   *    Dirichlet terms for Tmm, Tmi, Tmd, Tim, Tii, Tid, Tdm, Tdi, Tdd   */  pri->tnum = atoi(Getword(fp, sqdARG_INT));  if (pri->tnum < 0)    Die("%d is bad; need at least one state transition mixture component", pri->tnum);  if (pri->tnum > MAXDCHLET)    Die("%d is bad, too many transition components (MAXDCHLET = %d)\n", MAXDCHLET);  for (q = 0; q < pri->tnum; q++)    {      pri->tq[q]    = (float) atof(Getword(fp, sqdARG_FLOAT));      for (x = 0; x < 7; x++) 	pri->t[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT));    }  /* Match emission priors:   * # of mixtures.   * then for each mixture:   *    prior P(q)   *    Dirichlet terms for Alphabet_size symbols in Alphabet   */  pri->mnum = atoi(Getword(fp, sqdARG_INT));  if (pri->mnum < 0)    Die("%d is bad; need at least one match emission mixture component", pri->mnum);  if (pri->mnum > MAXDCHLET)    Die("%d is bad; too many match components (MAXDCHLET = %d)\n", pri->mnum, MAXDCHLET);  for (q = 0; q < pri->mnum; q++)    {      pri->mq[q] = (float) atof(Getword(fp, sqdARG_FLOAT));      for (x = 0; x < Alphabet_size; x++) 	pri->m[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT));    }    /* Insert emission priors:   * # of mixtures.   * then for each mixture component:   *    prior P(q)   *    Dirichlet terms for Alphabet_size symbols in Alphabet   */  pri->inum = atoi(Getword(fp, sqdARG_INT));  if (pri->inum < 0)    Die("%d is bad; need at least one insert emission mixture component", pri->inum);  if (pri->inum > MAXDCHLET)    Die("%d is bad; too many insert components (MAXDCHLET = %d)\n", pri->inum,  MAXDCHLET);  for (q = 0; q < pri->inum; q++)    {      pri->iq[q]  = (float) atof(Getword(fp, sqdARG_FLOAT));      for (x = 0; x < Alphabet_size; x++) 	pri->i[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT));    }  fclose(fp);  return pri;}/* Function: PAMPrior() *  * Purpose:  Produces an ad hoc "Dirichlet mixture" prior for *           match emissions, using a PAM matrix.  *            *           Side effect notice: PAMPrior() replaces the match *           emission section of an existing Dirichlet prior, *           which is /expected/ to be a simple one-component  *           kind of prior. The insert emissions /must/ be a *           one-component prior (because of details in how  *           PriorifyEmissionVector() is done). However,  *           the transitions /could/ be a mixture Dirichlet prior  *           without causing problems. In other words, the *           -p and -P options of hmmb can coexist, but there *           may be conflicts. PAMPrior() checks for these, *           so there's no serious problem, except that the *           error message from PAMPrior() might be confusing to *           a user.  */voidPAMPrior(char *pamfile, struct p7prior_s *pri, float wt){  FILE  *fp;  char  *blastpamfile;            /* BLAST looks in aa/ subdirectory of BLASTMAT */  int  **pam;  float  scale;  int    xi, xj;  int    idx1, idx2;  if (Alphabet_type != hmmAMINO)    Die("PAM prior is only valid for protein sequences");  if (pri->strategy != PRI_DCHLET)    Die("PAM prior may only be applied over an existing Dirichlet prior");  if (pri->inum != 1)    Die("PAM prior requires that the insert emissions be a single Dirichlet");  if (MAXDCHLET < 20)    Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior");  blastpamfile = FileConcat("aa", pamfile);  if ((fp = fopen(pamfile, "r")) == NULL &&      (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL &&      (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL)    Die("Failed to open PAM scoring matrix file %s", pamfile);  if (! ParsePAMFile(fp, &pam, &scale))    Die("Failed to parse PAM scoring matrix file %s", pamfile);  fclose(fp);  free(blastpamfile);  pri->strategy = PRI_PAM;  pri->mnum     = 20;    /* Convert PAM entries back to conditional prob's P(xj | xi),   * which we'll use as "pseudocounts" weighted by wt.   */  for (xi = 0; xi < Alphabet_size; xi++)    for (xj = 0; xj < Alphabet_size; xj++)      {        idx1 = Alphabet[xi] - 'A';        idx2 = Alphabet[xj] - 'A';        pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale);      }    /* Normalize so that rows add up to wt.   * i.e. Sum(xj) mat[xi][xj] = wt for every row xi   */  for (xi = 0; xi < Alphabet_size; xi++)    {      pri->mq[xi] = 1. / Alphabet_size;      FNorm(pri->m[xi], Alphabet_size);      FScale(pri->m[xi], Alphabet_size, wt);    }  Free2DArray((void **)pam,27);}/* Function: P7DefaultNullModel() *  * Purpose:  Set up a default random sequence model, using *           global aafq[]'s for protein or 1/Alphabet_size for anything *           else. randomseq is alloc'ed in caller. Alphabet information *           must already be known. */voidP7DefaultNullModel(float *null, float *ret_p1){  int x;  if (Alphabet_type == hmmAMINO) {    for (x = 0; x < Alphabet_size; x++)      null[x] = aafq[x];    *ret_p1 = 350./351.;	/* rationale: approx avg protein length. */  } else {    for (x = 0; x < Alphabet_size; x++)      null[x] = 1.0 / (float) Alphabet_size;    *ret_p1 = 1000./1001.;	/* rationale: approx inter-Alu distance. */  }}voidP7ReadNullModel(char *rndfile, float *null, float *ret_p1){  FILE *fp;  char *s;  int   x;  int   type = 0;   if ((fp = fopen(rndfile, "r")) == NULL)    Die("Failed to open null model file %s\n", rndfile);  if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;  s2upper(s);  if      (strcmp(s, "NUCLEIC") == 0) type = hmmNUCLEIC;  else if (strcmp(s, "AMINO")   == 0) type = hmmAMINO;  else    goto FAILURE;				/* check/set alphabet type */  if (Alphabet_type == 0)     SetAlphabet(type);  else if (Alphabet_type != type)    Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile);				/* parse the file */  for (x = 0; x < Alphabet_size; x++) {    if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE;    null[x] = atof(s);  }  if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE;  *ret_p1 = atof(s);  fclose(fp);  return;FAILURE:  fclose(fp);  Die("%s is not in HMMER null model file format", rndfile);}/* Function: P7PriorifyHMM() *  * Purpose:  Add pseudocounts to an HMM using Dirichlet priors, *           and renormalize the HMM. *  * Args:     hmm -- the HMM to add counts to (counts form) *           pri -- the Dirichlet prior to use *            * Return:   (void) *           HMM returns in probability form. */          voidP7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri){  int k;			/* counter for model position   */  float d;			/* a denominator */  float tq[MAXDCHLET];		/* prior distribution over mixtures */  float mq[MAXDCHLET];		/* prior distribution over mixtures */  float iq[MAXDCHLET];		/* prior distribution over mixtures */  /* Model-dependent transitions are handled simply; Laplace.   */  FSet(hmm->begin+2, hmm->M-1, 0.);     /* wipe internal BM entries */  FSet(hmm->end+1, hmm->M-1, 0.);	/* wipe internal ME exits   */  d = hmm->tbd1 + hmm->begin[1] + 2.;  hmm->tbd1        = (hmm->tbd1 + 1.)/ d;  hmm->begin[1]    = (hmm->begin[1] + 1.)/ d;  hmm->end[hmm->M] = 1.0;  /* Main model transitions and emissions   */  for (k = 1; k < hmm->M; k++)    {      /* The following code chunk is experimental.        * Collaboration with Michael Asman, Erik Sonnhammer, CGR Stockholm.
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -