⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hmmpostal.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 3 页
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. ************************************************************//* Derived from code developed by Ian Holmes (Sanger Centre and UC Berkeley) * Copyright (C) 1998 Ian Holmes * Distributed under the GNU General Public License */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "structs.h"		/* data structures, macros, #define's   */#include "config.h"		/* compile-time configuration constants */#include "funcs.h"		/* function declarations                */#include "globals.h"		/* alphabet global variables            */#include "squid.h"		/* general sequence analysis library    */static char banner[] = "hmmbuild - build a hidden Markov model from an alignment";static char usage[]  = "\Usage: hmmbuildpost [-options] <alignment file>\n\  Available options are:\n\   -h           : help; print brief help on version and usage\n\   -n <s>       : name; name this HMM <s>\n\   -r <hmmfile> : read HMM from <file> instead of building\n\   -m <hmmfile> : save HMM to <file>\n\   -o <file>    : re-save annotated alignment to <file>\n\   -A           : append; append this HMM to <hmmfile>\n\   -F           : force; allow overwriting of <hmmfile>\n\\n\  Alternative search algorithm styles: (default: hmmls domain alignment)\n\   -f        : multi-hit local (hmmfs style)\n\   -g        : global alignment (hmms style, Needleman/Wunsch)\n\   -s        : local alignment (hmmsw style, Smith/Waterman)\n\";static char experts[] = "\  Optional re-alignment of sequences to model:\n\   --viterbi : standard max-likelihood (Viterbi) algorithm\n\   --optacc  : optimal accuracy algorithm\n\\n\  Alternative model construction strategies: (default: MAP)\n\   --fast    : Krogh/Haussler fast heuristic construction (see --gapmax)\n\   --hand    : manual construction (requires SELEX file, #=RF annotation)\n\\n\  Expert customization of parameters and priors:\n\   --null  <file> : read null (random sequence) model from <file>\n\   --pam <file>   : heuristic PAM-based prior, using BLAST PAM matrix in <file>\n\   --prior <file> : read Dirichlet prior parameters from <file>\n\\n\  Alternative sequence weighting strategies: (default: GSC weights)\n\   --wblosum     : Henikoff simple filter weights (see --idlevel)\n\   --wgsc        : Gerstein/Sonnhammer/Chothia tree weights (default)\n\   --wme         : maximum entropy (ME)\n\   --wvoronoi    : Sibbald/Argos Voronoi weights\n\   --wnone       : don't do any weighting\n\   --noeff       : don't use effective sequence number; just use nseq\n\\n\  Forcing an alphabet: (normally autodetected)\n\   --amino   : override autodetection, assert that seqs are protein\n\   --nucleic : override autodetection, assert that seqs are DNA/RNA\n\\n\  Other expert options:\n\   --archpri <x> : set architecture size prior to <x> {0.85} [0..1]\n\   --binary      : save the model in binary format, not ASCII text\n\   --cfile <file>: save count vectors to <file>\n\   --gapmax <x>  : max fraction of gaps in mat column {0.50} [0..1]\n\   --idlevel <x> : set frac. id level used by eff. nseq and --wblosum {0.62}\n\   --informat <s>: input alignment is in format <s>, not Stockholm\n\   --pamwgt <x>  : set weight on PAM-based prior to <x> {20.}[>=0]\n\   --swentry <x> : set S/W aggregate entry prob. to <x> {0.5}\n\   --swexit <x>  : set S/W aggregate exit prob. to <x>  {0.5}\n\   --verbose     : print a lot of boring information\n\\n";static struct opt_s OPTIONS[] = {  { "-f", TRUE, sqdARG_NONE },  { "-g", TRUE, sqdARG_NONE },   { "-h", TRUE, sqdARG_NONE },   { "-n", TRUE, sqdARG_STRING},    { "-r", TRUE, sqdARG_STRING},  { "-m", TRUE, sqdARG_STRING},  { "-o", TRUE, sqdARG_STRING},  { "-s", TRUE, sqdARG_NONE },   { "-A", TRUE, sqdARG_NONE },  { "-F", TRUE, sqdARG_NONE },  { "--amino",   FALSE, sqdARG_NONE  },  { "--archpri", FALSE, sqdARG_FLOAT },   { "--binary",  FALSE, sqdARG_NONE  },   { "--cfile",   FALSE, sqdARG_STRING},  { "--fast",    FALSE, sqdARG_NONE},  { "--gapmax",  FALSE, sqdARG_FLOAT },  { "--hand",    FALSE, sqdARG_NONE},  { "--idlevel", FALSE, sqdARG_FLOAT },  { "--informat",FALSE, sqdARG_STRING },  { "--noeff",   FALSE, sqdARG_NONE },  { "--nucleic", FALSE, sqdARG_NONE },  { "--null",    FALSE, sqdARG_STRING },  { "--optacc",  FALSE, sqdARG_NONE  },  { "--pam",     FALSE, sqdARG_STRING },  { "--pamwgt",  FALSE, sqdARG_FLOAT },  { "--prior",   FALSE, sqdARG_STRING },  { "--swentry", FALSE, sqdARG_FLOAT },  { "--swexit",  FALSE, sqdARG_FLOAT },  { "--verbose", FALSE, sqdARG_NONE  },  { "--viterbi", FALSE, sqdARG_NONE  },  { "--wgsc",    FALSE, sqdARG_NONE },  { "--wblosum", FALSE, sqdARG_NONE },  { "--wme",     FALSE, sqdARG_NONE },  { "--wnone",   FALSE, sqdARG_NONE },  { "--wvoronoi",FALSE, sqdARG_NONE },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))static void save_model(struct plan7_s *hmm, char *hmmfile, int do_append, int do_binary);static void print_all_scores(FILE *fp, struct plan7_s *hmm, 			     AINFO *ainfo, char **dsq, int nseq, 			     struct p7trace_s **tr);static void save_countvectors(char *cfile, struct plan7_s *hmm);static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt,				   int nseq, struct p7trace_s **tr, float *pernode,				   float *ret_avg);static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, 			      float *pernode, float expected);static void maximum_entropy(struct plan7_s *hmm, char **dsq, AINFO *ainfo, 			    int nseq, float eff_nseq, 			    struct p7prior_s *prior, struct p7trace_s **tr);extern void Postcode(int L, struct dpmatrix_s *mx, struct p7trace_s *tr);intmain(int argc, char **argv) {  char            *seqfile;     /* seqfile to read alignment from          */   int              format;	/* format of seqfile                       */  MSAFILE         *afp;         /* open alignment file                     */  MSA             *msa;         /* a multiple sequence alignment           */  char           **dsq;         /* digitized unaligned aseq's              */   struct plan7_s  *hmm;         /* constructed HMM; written to hmmfile     */  struct p7prior_s *pri;        /* Dirichlet priors to use                 */  struct p7trace_s **tr;        /* fake tracebacks for aseq's              */   char            *readfile;    /* file to read HMM from                   */  HMMFILE         *hmmfp;       /* opened hmmfile for reading              */  char            *hmmfile;     /* file to write HMM to                    */  FILE            *fp;          /* OUTPUT file handle (misc.)              */  char            *name;        /* name of the HMM                         */  int              idx;		/* counter for sequences                   */  float  randomseq[MAXABET];	/* null sequence model                     */  float            p1;		/* null sequence model p1 transition       */  char *optname;                /* name of option found by Getopt()      */  char *optarg;                 /* argument found by Getopt()            */  int   optind;                 /* index in argv[]                       */  enum p7_construction c_strategy; /* construction strategy choice        */  enum p7_weight {		/* weighting strategy */    WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_VORONOI, WGT_ME} w_strategy;  enum p7_config {              /* algorithm configuration strategy      */    P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy;  float gapmax;			/* max frac gaps in mat col for -k       */  int   overwrite_protect;	/* TRUE to prevent overwriting HMM file  */  enum  realignment_strategy {  /* re-alignment strategy                 */    REALIGN_NONE, REALIGN_VITERBI, REALIGN_OPTACC } r_strategy;  int   verbose;		/* TRUE to show a lot of output          */  char *align_ofile;            /* name of output alignment file         */  char *rndfile;		/* random sequence model file to read    */  char *prifile;		/* Dirichlet prior file to read          */  char *pamfile;		/* PAM matrix file for heuristic prior   */  char *cfile;			/* output file for count vectors         */  float archpri;		/* "architecture" prior on model size    */  float pamwgt;			/* weight on PAM for heuristic prior     */  int   do_append;		/* TRUE to append to hmmfile             */  int   do_binary;		/* TRUE to write in binary format        */  float blosumlevel;		/* BLOSUM frac id filtering level [0.62] */  float swentry;		/* S/W aggregate entry probability       */  float swexit;			/* S/W aggregate exit probability        */  int   do_eff;			/* TRUE to set an effective seq number   */  float eff_nseq;		/* effective sequence number             */  int   checksum;  int   len;  struct dpmatrix_s *forward_mx;   /* Forward matrix                        */  struct dpmatrix_s *backward_mx;  /* Backward matrix                       */  struct dpmatrix_s *posterior_mx; /* Posterior matrix                      */  struct dpmatrix_s *optacc_mx;    /* Optimal accuracy matrix               */  /***********************************************    * Parse command line   ***********************************************/    format            = MSAFILE_UNKNOWN;  c_strategy        = P7_MAP_CONSTRUCTION;  w_strategy        = WGT_GSC;  blosumlevel       = 0.62;  cfg_strategy      = P7_LS_CONFIG;  gapmax            = 0.5;  overwrite_protect = TRUE;  r_strategy        = REALIGN_NONE;  verbose           = FALSE;  readfile          = NULL;  hmmfile           = NULL;  align_ofile       = NULL;  rndfile           = NULL;  prifile           = NULL;  pamfile           = NULL;  cfile             = NULL;  archpri           = 0.85;  pamwgt            = 20.;  Alphabet_type     = hmmNOTSETYET;	/* initially unknown */  name              = NULL;  do_append         = FALSE;   swentry           = 0.5;  swexit            = 0.5;  do_eff            = TRUE;  do_binary         = FALSE;    while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,                &optind, &optname, &optarg))  {    if      (strcmp(optname, "-f") == 0) cfg_strategy      = P7_FS_CONFIG;    else if (strcmp(optname, "-g") == 0) cfg_strategy      = P7_BASE_CONFIG;    else if (strcmp(optname, "-n") == 0) name              = Strdup(optarg);     else if (strcmp(optname, "-r") == 0) readfile          = optarg;    else if (strcmp(optname, "-m") == 0) hmmfile           = optarg;    else if (strcmp(optname, "-o") == 0) align_ofile       = optarg;    else if (strcmp(optname, "-r") == 0) rndfile           = optarg;    else if (strcmp(optname, "-s") == 0) cfg_strategy      = P7_SW_CONFIG;    else if (strcmp(optname, "-A") == 0) do_append         = TRUE;     else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE;    else if (strcmp(optname, "--amino")   == 0) SetAlphabet(hmmAMINO);    else if (strcmp(optname, "--archpri") == 0) archpri       = atof(optarg);    else if (strcmp(optname, "--binary")  == 0) do_binary     = TRUE;    else if (strcmp(optname, "--cfile")   == 0) cfile         = optarg;    else if (strcmp(optname, "--fast")    == 0) c_strategy    = P7_FAST_CONSTRUCTION;    else if (strcmp(optname, "--hand")    == 0) c_strategy    = P7_HAND_CONSTRUCTION;    else if (strcmp(optname, "--gapmax")  == 0) gapmax        = atof(optarg);    else if (strcmp(optname, "--idlevel") == 0) blosumlevel   = atof(optarg);    else if (strcmp(optname, "--noeff")   == 0) do_eff        = FALSE;    else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC);    else if (strcmp(optname, "--optacc")  == 0) r_strategy    = REALIGN_OPTACC;    else if (strcmp(optname, "--pam")     == 0) pamfile       = optarg;    else if (strcmp(optname, "--pamwgt")  == 0) pamwgt        = atof(optarg);    else if (strcmp(optname, "--prior")   == 0) prifile       = optarg;    else if (strcmp(optname, "--swentry") == 0) swentry       = atof(optarg);     else if (strcmp(optname, "--swexit")  == 0) swexit        = atof(optarg);     else if (strcmp(optname, "--verbose") == 0) verbose       = TRUE;    else if (strcmp(optname, "--viterbi") == 0) r_strategy    = REALIGN_VITERBI;    else if (strcmp(optname, "--wgsc")    == 0) w_strategy    = WGT_GSC;    else if (strcmp(optname, "--wblosum") == 0) w_strategy    = WGT_BLOSUM;     else if (strcmp(optname, "--wme")     == 0) w_strategy    = WGT_ME;      else if (strcmp(optname, "--wnone")   == 0) w_strategy    = WGT_NONE;     else if (strcmp(optname, "--wvoronoi")== 0) w_strategy    = WGT_VORONOI;    else if (strcmp(optname, "--informat") == 0) {      format = String2SeqfileFormat(optarg);      if (format == MSAFILE_UNKNOWN) 	Die("unrecognized sequence file format \"%s\"", optarg);      if (! IsAlignmentFormat(format))	Die("%s is an unaligned format, can't read as an alignment", optarg);    }    else if (strcmp(optname, "-h") == 0) {      Banner(stdout, banner);      puts(usage);      puts(experts);      exit(0);    }  }  if (argc - optind != 1)    Die("Incorrect number of arguments.\n%s\n", usage);  seqfile = argv[optind++];   if (readfile != NULL && r_strategy == REALIGN_NONE)    r_strategy = REALIGN_VITERBI;  if (gapmax < 0. || gapmax > 1.)     Die("--gapmax must be a value from 0 to 1\n%s\n", usage);  if (archpri < 0. || archpri > 1.)    Die("--archpri must be a value from 0 to 1\n%s\n", usage);  if (overwrite_protect && hmmfile && !do_append && FileExists(hmmfile))    Die("HMM file %s already exists. Rename or delete it.", hmmfile);   if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile))    Die("Alignment resave file %s exists. Rename or delete it.", align_ofile);   /***********************************************    * Get sequence data   ***********************************************/				/* Open the alignment */  if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL)    Die("Alignment file %s could not be opened for reading", seqfile);                                /* read the alignment from file */  if ((msa = MSAFileRead(afp)) == NULL)     Die("Failed to read aligned sequence file %s", seqfile);  for (idx = 0; idx < msa->nseq; idx++)    s2upper(msa->aseq[idx]);  MSAFileClose(afp);				/* Set up the alphabet globals */  if (Alphabet_type == hmmNOTSETYET)     DetermineAlphabet(msa->aseq, msa->nseq);				/* Set up Dirichlet priors */  if (prifile == NULL)  pri = P7DefaultPrior();  else                  pri = P7ReadPrior(prifile);  if (pamfile != NULL)  PAMPrior(pamfile, pri, pamwgt);				/* Set up the null/random seq model */  if (rndfile == NULL)  P7DefaultNullModel(randomseq, &p1);  else                  P7ReadNullModel(rndfile, randomseq, &p1);				/* Prepare sequences for internal use */  DigitizeAlignment(msa, &dsq);  				/* In some respects we treat DNA more crudely... */  if (Alphabet_type == hmmNUCLEIC)    {      do_eff     = FALSE;	/* don't do effective seq #; it's calibrated for protein */    } /***********************************************   * Either read in an HMM or build from alignment,  * depending on user specifications.  ***********************************************/  if (readfile != NULL) {    /***********************************************      * Open HMM file (might be in HMMERDB or current directory).     * Read a single HMM from it.     ***********************************************/        if ((hmmfp = HMMFileOpen(readfile, "HMMERDB")) == NULL)      Die("Failed to open HMM file %s\n%s", readfile, usage);    if (!HMMFileRead(hmmfp, &hmm))       Die("Failed to read any HMMs from %s\n", readfile);    HMMFileClose(hmmfp);    if (hmm == NULL)       Die("HMM file %s corrupt or in incorrect format? Parse failed", readfile);    tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * msa->nseq);    for (idx = 0; idx < msa->nseq; idx++)      tr[idx] = 0;  } else {    /***********************************************      * Build an HMM     ***********************************************/        /* Determine the effective sequence number to use (optional)     */    eff_nseq = (float) msa->nseq;    if (do_eff)      {	float *wgt;	printf("%-40s ... ", "Determining effective sequence number");	fflush(stdout);				/* dummy weights array to feed BlosumWeights*/	wgt = MallocOrDie(sizeof(float) * msa->nseq);	BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt);	eff_nseq = FSum(wgt, msa->nseq);	free(wgt);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -