⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hmmbuild.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 3 页
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. ************************************************************//* hmmbuild.c * SRE, Mon Nov 18 12:41:29 1996 * * main() for HMM construction from an alignment. * CVS $Id: hmmbuild.c,v 1.23 2000/12/24 22:55:18 eddy Exp $ */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "structs.h"		/* data structures, macros, #define's   */#include "config.h"		/* compile-time configuration constants */#include "funcs.h"		/* function declarations                */#include "globals.h"		/* alphabet global variables            */#include "squid.h"		/* general sequence analysis library    */#include "msa.h"                /* squid's multiple alignment i/o       */static char banner[] = "hmmbuild - build a hidden Markov model from an alignment";static char usage[]  = "\Usage: hmmbuild [-options] <hmmfile output> <alignment file>\n\  Available options are:\n\   -h     : help; print brief help on version and usage\n\   -n <s> : name; name this (first) HMM <s>\n\   -o <f> : re-save annotated alignment to <f>\n\   -A     : append; append this HMM to <hmmfile>\n\   -F     : force; allow overwriting of <hmmfile>\n\\n\  Alternative search algorithm styles: (default: hmmls domain alignment)\n\   -f     : multi-hit local (hmmfs style)\n\   -g     : global alignment (hmms style, Needleman/Wunsch)\n\   -s     : local alignment (hmmsw style, Smith/Waterman)\n\";static char experts[] = "\  Alternative model construction strategies: (default: MAP)\n\   --fast        : Krogh/Haussler fast heuristic construction (see --gapmax)\n\   --hand        : manual construction (requires annotated alignment)\n\\n\  Expert customization of parameters and priors:\n\   --null  <f>   : read null (random sequence) model from <f>\n\   --pam   <f>   : heuristic PAM-based prior, using BLAST PAM matrix in <f>\n\   --prior <f>   : read Dirichlet prior parameters from <f>\n\\n\  Alternative sequence weighting strategies: (default: GSC weights)\n\   --wblosum     : Henikoff simple filter weights (see --idlevel)\n\   --wgsc        : Gerstein/Sonnhammer/Chothia tree weights (default)\n\   --wme         : maximum entropy (ME)\n\   --wpb         : Henikoff position-based weights\n\   --wvoronoi    : Sibbald/Argos Voronoi weights\n\   --wnone       : don't do any weighting\n\   --noeff       : don't use effective sequence number; just use nseq\n\   --pbswitch <n>: set switch from GSC to position-based wgts at > n seqs\n\\n\  Forcing an alphabet: (normally autodetected)\n\   --amino   : override autodetection, assert that seqs are protein\n\   --nucleic : override autodetection, assert that seqs are DNA/RNA\n\\n\  Other expert options:\n\   --archpri <x> : set architecture size prior to <x> {0.85} [0..1]\n\   --binary      : save the model in binary format, not ASCII text\n\   --cfile <f>   : save count vectors to <f>\n\   --gapmax <x>  : max fraction of gaps in mat column {0.50} [0..1]\n\   --idlevel <x> : set frac. id level used by eff. nseq and --wblosum {0.62}\n\   --informat <s>: input alignment is in format <s>, not Stockholm\n\   --pamwgt <x>  : set weight on PAM-based prior to <x> {20.}[>=0]\n\   --swentry <x> : set S/W aggregate entry prob. to <x> {0.5}\n\   --swexit <x>  : set S/W aggregate exit prob. to <x>  {0.5}\n\   --verbose     : print boring information\n\\n";static struct opt_s OPTIONS[] = {  { "-f", TRUE, sqdARG_NONE },  { "-g", TRUE, sqdARG_NONE },   { "-h", TRUE, sqdARG_NONE },   { "-n", TRUE, sqdARG_STRING},    { "-o", TRUE, sqdARG_STRING},  { "-s", TRUE, sqdARG_NONE },   { "-A", TRUE, sqdARG_NONE },  { "-F", TRUE, sqdARG_NONE },  { "--amino",   FALSE, sqdARG_NONE  },  { "--archpri", FALSE, sqdARG_FLOAT },   { "--binary",  FALSE, sqdARG_NONE  },   { "--cfile",   FALSE, sqdARG_STRING},  { "--fast",    FALSE, sqdARG_NONE},  { "--gapmax",  FALSE, sqdARG_FLOAT },  { "--hand",    FALSE, sqdARG_NONE},  { "--idlevel", FALSE, sqdARG_FLOAT },  { "--informat",FALSE, sqdARG_STRING },  { "--noeff",   FALSE, sqdARG_NONE },  { "--nucleic", FALSE, sqdARG_NONE },  { "--null",    FALSE, sqdARG_STRING },  { "--pam",     FALSE, sqdARG_STRING },  { "--pamwgt",  FALSE, sqdARG_FLOAT },  { "--pbswitch",FALSE, sqdARG_INT },  { "--prior",   FALSE, sqdARG_STRING },  { "--swentry", FALSE, sqdARG_FLOAT },  { "--swexit",  FALSE, sqdARG_FLOAT },  { "--verbose", FALSE, sqdARG_NONE  },  { "--wgsc",    FALSE, sqdARG_NONE },  { "--wblosum", FALSE, sqdARG_NONE },  { "--wme",     FALSE, sqdARG_NONE },  { "--wnone",   FALSE, sqdARG_NONE },  { "--wpb",     FALSE, sqdARG_NONE },  { "--wvoronoi",FALSE, sqdARG_NONE },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))static void print_all_scores(FILE *fp, struct plan7_s *hmm, 			     char **dsq, MSA *msa, struct p7trace_s **tr);static void save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm);static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt,				   int nseq, struct p7trace_s **tr, float *pernode,				   float *ret_avg);static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, 			      float *pernode, float expected);static void maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa,			    float eff_nseq, 			    struct p7prior_s *prior, struct p7trace_s **tr);intmain(int argc, char **argv) {  char            *seqfile;     /* seqfile to read alignment from          */   int              format;	/* format of seqfile                       */  MSAFILE         *afp;         /* open alignment file                     */  MSA             *msa;         /* a multiple sequence alignment           */  char           **dsq;         /* digitized unaligned aseq's              */   struct plan7_s  *hmm;         /* constructed HMM; written to hmmfile     */  struct p7prior_s *pri;        /* Dirichlet priors to use                 */  struct p7trace_s **tr;        /* fake tracebacks for aseq's              */   char            *hmmfile;     /* file to write HMM to                    */  FILE            *hmmfp;       /* HMM output file handle                  */  char            *name;        /* name of the HMM                         */  int              idx;		/* counter for sequences                   */  float  randomseq[MAXABET];	/* null sequence model                     */  float            p1;		/* null sequence model p1 transition       */  int              nali;	/* count number of alignments/HMMs         */  char             fpopts[3];   /* options to open a file with, e.g. "ab"  */  int              checksum;	/* checksum of the alignment               */  char *optname;                /* name of option found by Getopt()      */  char *optarg;                 /* argument found by Getopt()            */  int   optind;                 /* index in argv[]                       */  enum p7_construction c_strategy; /* construction strategy choice        */  enum p7_weight {		/* weighting strategy */    WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_PB, WGT_VORONOI, WGT_ME} w_strategy;  enum p7_config {              /* algorithm configuration strategy      */    P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy;  float gapmax;			/* max frac gaps in mat col for -k       */  int   overwrite_protect;	/* TRUE to prevent overwriting HMM file  */  int   verbose;		/* TRUE to show a lot of output          */  char *rndfile;		/* random sequence model file to read    */  char *prifile;		/* Dirichlet prior file to read          */  char *pamfile;		/* PAM matrix file for heuristic prior   */  char *align_ofile;            /* name of output alignment file         */  char *cfile;			/* output file for count vectors         */  FILE *alignfp;                /* open filehandle for alignment resaves */  FILE *cfp;                    /* open filehandle for count vector saves*/  float archpri;		/* "architecture" prior on model size    */  float pamwgt;			/* weight on PAM for heuristic prior     */  int   do_append;		/* TRUE to append to hmmfile             */  int   do_binary;		/* TRUE to write in binary format        */  float blosumlevel;		/* BLOSUM frac id filtering level [0.62] */  float swentry;		/* S/W aggregate entry probability       */  float swexit;			/* S/W aggregate exit probability        */  int   do_eff;			/* TRUE to set an effective seq number   */  float eff_nseq;		/* effective sequence number             */  int   pbswitch;		/* nseq >= this, switchover to PB weights*/  char *setname;                /* NULL, or ptr to HMM name to set       */  int   gapmax_set;		/* TRUE if gapmax was set on commandline */  /***********************************************    * Parse command line   ***********************************************/    format            = MSAFILE_UNKNOWN;        /* autodetect format by default. */  c_strategy        = P7_MAP_CONSTRUCTION;  w_strategy        = WGT_GSC;  blosumlevel       = 0.62;  cfg_strategy      = P7_LS_CONFIG;  gapmax            = 0.5;  overwrite_protect = TRUE;  verbose           = FALSE;  rndfile           = NULL;  prifile           = NULL;  pamfile           = NULL;  align_ofile       = NULL;  alignfp           = NULL;  cfile             = NULL;  cfp               = NULL;  archpri           = 0.85;  pamwgt            = 20.;  Alphabet_type     = hmmNOTSETYET;	/* initially unknown */  name              = NULL;  do_append         = FALSE;   swentry           = 0.5;  swexit            = 0.5;  do_eff            = TRUE;  do_binary         = FALSE;  pbswitch          = 1000;  setname           = NULL;  gapmax_set        = FALSE;    while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,                &optind, &optname, &optarg))  {    if      (strcmp(optname, "-f") == 0) cfg_strategy      = P7_FS_CONFIG;    else if (strcmp(optname, "-g") == 0) cfg_strategy      = P7_BASE_CONFIG;    else if (strcmp(optname, "-n") == 0) setname           = optarg;     else if (strcmp(optname, "-o") == 0) align_ofile       = optarg;    else if (strcmp(optname, "-s") == 0) cfg_strategy      = P7_SW_CONFIG;    else if (strcmp(optname, "-A") == 0) do_append         = TRUE;     else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE;    else if (strcmp(optname, "--amino")   == 0) SetAlphabet(hmmAMINO);    else if (strcmp(optname, "--archpri") == 0) archpri       = atof(optarg);    else if (strcmp(optname, "--binary")  == 0) do_binary     = TRUE;    else if (strcmp(optname, "--cfile")   == 0) cfile         = optarg;    else if (strcmp(optname, "--fast")    == 0) c_strategy    = P7_FAST_CONSTRUCTION;    else if (strcmp(optname, "--gapmax")  == 0) { gapmax      = atof(optarg); gapmax_set = TRUE; }    else if (strcmp(optname, "--hand")    == 0) c_strategy    = P7_HAND_CONSTRUCTION;    else if (strcmp(optname, "--idlevel") == 0) blosumlevel   = atof(optarg);    else if (strcmp(optname, "--noeff")   == 0) do_eff        = FALSE;    else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC);    else if (strcmp(optname, "--null")    == 0) rndfile       = optarg;    else if (strcmp(optname, "--pam")     == 0) pamfile       = optarg;    else if (strcmp(optname, "--pamwgt")  == 0) pamwgt        = atof(optarg);    else if (strcmp(optname, "--pbswitch")== 0) pbswitch      = atoi(optarg);    else if (strcmp(optname, "--prior")   == 0) prifile       = optarg;    else if (strcmp(optname, "--swentry") == 0) swentry       = atof(optarg);     else if (strcmp(optname, "--swexit")  == 0) swexit        = atof(optarg);     else if (strcmp(optname, "--verbose") == 0) verbose       = TRUE;    else if (strcmp(optname, "--wgsc")    == 0) w_strategy    = WGT_GSC;    else if (strcmp(optname, "--wblosum") == 0) w_strategy    = WGT_BLOSUM;     else if (strcmp(optname, "--wme")     == 0) w_strategy    = WGT_ME;      else if (strcmp(optname, "--wpb")     == 0) w_strategy    = WGT_PB;      else if (strcmp(optname, "--wnone")   == 0) w_strategy    = WGT_NONE;     else if (strcmp(optname, "--wvoronoi")== 0) w_strategy    = WGT_VORONOI;    else if (strcmp(optname, "--informat") == 0) {      format = String2SeqfileFormat(optarg);      if (format == MSAFILE_UNKNOWN) 	Die("unrecognized sequence file format \"%s\"", optarg);      if (! IsAlignmentFormat(format))	Die("%s is an unaligned format, can't read as an alignment", optarg);    }    else if (strcmp(optname, "-h") == 0) {      Banner(stdout, banner);      puts(usage);      puts(experts);      exit(EXIT_SUCCESS);    }  }  if (argc - optind != 2)    Die("Incorrect number of arguments.\n%s\n", usage);  hmmfile = argv[optind++];  seqfile = argv[optind++];   if (gapmax < 0. || gapmax > 1.)     Die("--gapmax must be a value from 0 to 1\n%s\n", usage);  if (archpri < 0. || archpri > 1.)    Die("--archpri must be a value from 0 to 1\n%s\n", usage);  if (overwrite_protect && !do_append && FileExists(hmmfile))    Die("HMM file %s already exists. Rename or delete it.", hmmfile);   if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile))    Die("Alignment resave file %s exists. Rename or delete it.", align_ofile);   if (gapmax_set && c_strategy  != P7_FAST_CONSTRUCTION)    Die("using --gapmax only makes sense if you use --fast");  /***********************************************    * Preliminaries: open our files for i/o   ***********************************************/				/* Open the alignment */  if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL)    Die("Alignment file %s could not be opened for reading", seqfile);				/* Open the HMM output file */  if (do_append) strcpy(fpopts, "a");  else           strcpy(fpopts, "w");  if (do_binary) strcat(fpopts, "b");  if ((hmmfp = fopen(hmmfile, fpopts)) == NULL)    Die("Failed to open HMM file %s for %s\n", hmmfile, 	do_append ? "appending" : "writing");				/* Open the count vector save file */  cfp = NULL;  if (cfile != NULL)    if ((cfp = fopen(cfile, "w")) == NULL)      Die("Failed to open count vector file %s for writing\n", cfile);				/* Open the alignment resave file */  alignfp = NULL;  if (align_ofile != NULL)    if ((alignfp = fopen(align_ofile, "w")) == NULL)      Die("Failed to open alignment resave file %s for writing\n", align_ofile);  /***********************************************    * Show the banner   ***********************************************/  Banner(stdout, banner);  printf("Alignment file:                    %s\n", 	 seqfile);  printf("File format:                       %s\n", 	 SeqfileFormat2String(afp->format));  printf("Search algorithm configuration:    ");  if      (cfg_strategy == P7_BASE_CONFIG)   puts("Global alignment (hmms)");  else if (cfg_strategy == P7_SW_CONFIG)     {    puts("Local  (hmmsw)");    printf("S/W aggregate entry probability:   %.2f\n", swentry);    printf("S/W aggregate exit probability:    %.2f\n", swexit);  }  else if (cfg_strategy == P7_LS_CONFIG)     puts("Multiple domain (hmmls)");  else if (cfg_strategy == P7_FS_CONFIG)     {    puts("Multiple local (hmmfs)");    printf("S/W aggregate entry probability:   %.2f\n", swentry);    printf("S/W aggregate exit probability:    %.2f\n", swexit);  }  printf("Model construction strategy:       ");  if (c_strategy == P7_HAND_CONSTRUCTION)    puts("Manual, from #=RF annotation");  else if (c_strategy==P7_FAST_CONSTRUCTION) printf("Fast/ad hoc (gapmax %.2f)\n", gapmax);  else                                       printf("MAP (gapmax hint: %.2f)\n", gapmax);  printf("Null model used:                   %s\n",	 (rndfile == NULL) ? "(default)" : rndfile);  printf("Prior used:                        %s\n",	 (prifile == NULL) ? "(default)" : prifile);  printf("Sequence weighting method:         ");  if      (w_strategy == WGT_NONE)   puts("none");  else if (w_strategy == WGT_GSC)    puts("G/S/C tree weights");  else if (w_strategy == WGT_BLOSUM) printf("BLOSUM filter at %.2f id\n", blosumlevel);  else if (w_strategy == WGT_PB)     puts("Henikoff position-based");  else if (w_strategy == WGT_VORONOI)puts("Sibbald/Argos Voronoi");  else if (w_strategy == WGT_ME)     puts("Maximum entropy");  printf("New HMM file:                      %s %s\n",	 hmmfile, do_append? "[appending]" : "");  if (cfile != NULL)    printf("Count vectors saved to:            %s\n", cfile);  if (align_ofile != NULL)    printf("Annotated alignment(s) resaved to: %s\n", align_ofile);  printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");  /***********************************************    * Get alignment(s), build HMMs one at a time   ***********************************************/  nali = 0;  while ((msa = MSAFileRead(afp)) != NULL)    {      /* Print some stuff about what we're about to do.       */      if (msa->name != NULL) printf("Alignment:           %s\n",  msa->name);      else                   printf("Alignment:           #%d\n", nali+1);      printf                       ("Number of sequences: %d\n",  msa->nseq);      printf                       ("Number of columns:   %d\n",  msa->alen);      puts("");      fflush(stdout);	  

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -