📄 hmmbuild.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* hmmbuild.c * SRE, Mon Nov 18 12:41:29 1996 * * main() for HMM construction from an alignment. * CVS $Id: hmmbuild.c,v 1.23 2000/12/24 22:55:18 eddy Exp $ */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "structs.h" /* data structures, macros, #define's */#include "config.h" /* compile-time configuration constants */#include "funcs.h" /* function declarations */#include "globals.h" /* alphabet global variables */#include "squid.h" /* general sequence analysis library */#include "msa.h" /* squid's multiple alignment i/o */static char banner[] = "hmmbuild - build a hidden Markov model from an alignment";static char usage[] = "\Usage: hmmbuild [-options] <hmmfile output> <alignment file>\n\ Available options are:\n\ -h : help; print brief help on version and usage\n\ -n <s> : name; name this (first) HMM <s>\n\ -o <f> : re-save annotated alignment to <f>\n\ -A : append; append this HMM to <hmmfile>\n\ -F : force; allow overwriting of <hmmfile>\n\\n\ Alternative search algorithm styles: (default: hmmls domain alignment)\n\ -f : multi-hit local (hmmfs style)\n\ -g : global alignment (hmms style, Needleman/Wunsch)\n\ -s : local alignment (hmmsw style, Smith/Waterman)\n\";static char experts[] = "\ Alternative model construction strategies: (default: MAP)\n\ --fast : Krogh/Haussler fast heuristic construction (see --gapmax)\n\ --hand : manual construction (requires annotated alignment)\n\\n\ Expert customization of parameters and priors:\n\ --null <f> : read null (random sequence) model from <f>\n\ --pam <f> : heuristic PAM-based prior, using BLAST PAM matrix in <f>\n\ --prior <f> : read Dirichlet prior parameters from <f>\n\\n\ Alternative sequence weighting strategies: (default: GSC weights)\n\ --wblosum : Henikoff simple filter weights (see --idlevel)\n\ --wgsc : Gerstein/Sonnhammer/Chothia tree weights (default)\n\ --wme : maximum entropy (ME)\n\ --wpb : Henikoff position-based weights\n\ --wvoronoi : Sibbald/Argos Voronoi weights\n\ --wnone : don't do any weighting\n\ --noeff : don't use effective sequence number; just use nseq\n\ --pbswitch <n>: set switch from GSC to position-based wgts at > n seqs\n\\n\ Forcing an alphabet: (normally autodetected)\n\ --amino : override autodetection, assert that seqs are protein\n\ --nucleic : override autodetection, assert that seqs are DNA/RNA\n\\n\ Other expert options:\n\ --archpri <x> : set architecture size prior to <x> {0.85} [0..1]\n\ --binary : save the model in binary format, not ASCII text\n\ --cfile <f> : save count vectors to <f>\n\ --gapmax <x> : max fraction of gaps in mat column {0.50} [0..1]\n\ --idlevel <x> : set frac. id level used by eff. nseq and --wblosum {0.62}\n\ --informat <s>: input alignment is in format <s>, not Stockholm\n\ --pamwgt <x> : set weight on PAM-based prior to <x> {20.}[>=0]\n\ --swentry <x> : set S/W aggregate entry prob. to <x> {0.5}\n\ --swexit <x> : set S/W aggregate exit prob. to <x> {0.5}\n\ --verbose : print boring information\n\\n";static struct opt_s OPTIONS[] = { { "-f", TRUE, sqdARG_NONE }, { "-g", TRUE, sqdARG_NONE }, { "-h", TRUE, sqdARG_NONE }, { "-n", TRUE, sqdARG_STRING}, { "-o", TRUE, sqdARG_STRING}, { "-s", TRUE, sqdARG_NONE }, { "-A", TRUE, sqdARG_NONE }, { "-F", TRUE, sqdARG_NONE }, { "--amino", FALSE, sqdARG_NONE }, { "--archpri", FALSE, sqdARG_FLOAT }, { "--binary", FALSE, sqdARG_NONE }, { "--cfile", FALSE, sqdARG_STRING}, { "--fast", FALSE, sqdARG_NONE}, { "--gapmax", FALSE, sqdARG_FLOAT }, { "--hand", FALSE, sqdARG_NONE}, { "--idlevel", FALSE, sqdARG_FLOAT }, { "--informat",FALSE, sqdARG_STRING }, { "--noeff", FALSE, sqdARG_NONE }, { "--nucleic", FALSE, sqdARG_NONE }, { "--null", FALSE, sqdARG_STRING }, { "--pam", FALSE, sqdARG_STRING }, { "--pamwgt", FALSE, sqdARG_FLOAT }, { "--pbswitch",FALSE, sqdARG_INT }, { "--prior", FALSE, sqdARG_STRING }, { "--swentry", FALSE, sqdARG_FLOAT }, { "--swexit", FALSE, sqdARG_FLOAT }, { "--verbose", FALSE, sqdARG_NONE }, { "--wgsc", FALSE, sqdARG_NONE }, { "--wblosum", FALSE, sqdARG_NONE }, { "--wme", FALSE, sqdARG_NONE }, { "--wnone", FALSE, sqdARG_NONE }, { "--wpb", FALSE, sqdARG_NONE }, { "--wvoronoi",FALSE, sqdARG_NONE },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))static void print_all_scores(FILE *fp, struct plan7_s *hmm, char **dsq, MSA *msa, struct p7trace_s **tr);static void save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm);static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt, int nseq, struct p7trace_s **tr, float *pernode, float *ret_avg);static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, float *pernode, float expected);static void maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa, float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr);intmain(int argc, char **argv) { char *seqfile; /* seqfile to read alignment from */ int format; /* format of seqfile */ MSAFILE *afp; /* open alignment file */ MSA *msa; /* a multiple sequence alignment */ char **dsq; /* digitized unaligned aseq's */ struct plan7_s *hmm; /* constructed HMM; written to hmmfile */ struct p7prior_s *pri; /* Dirichlet priors to use */ struct p7trace_s **tr; /* fake tracebacks for aseq's */ char *hmmfile; /* file to write HMM to */ FILE *hmmfp; /* HMM output file handle */ char *name; /* name of the HMM */ int idx; /* counter for sequences */ float randomseq[MAXABET]; /* null sequence model */ float p1; /* null sequence model p1 transition */ int nali; /* count number of alignments/HMMs */ char fpopts[3]; /* options to open a file with, e.g. "ab" */ int checksum; /* checksum of the alignment */ char *optname; /* name of option found by Getopt() */ char *optarg; /* argument found by Getopt() */ int optind; /* index in argv[] */ enum p7_construction c_strategy; /* construction strategy choice */ enum p7_weight { /* weighting strategy */ WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_PB, WGT_VORONOI, WGT_ME} w_strategy; enum p7_config { /* algorithm configuration strategy */ P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy; float gapmax; /* max frac gaps in mat col for -k */ int overwrite_protect; /* TRUE to prevent overwriting HMM file */ int verbose; /* TRUE to show a lot of output */ char *rndfile; /* random sequence model file to read */ char *prifile; /* Dirichlet prior file to read */ char *pamfile; /* PAM matrix file for heuristic prior */ char *align_ofile; /* name of output alignment file */ char *cfile; /* output file for count vectors */ FILE *alignfp; /* open filehandle for alignment resaves */ FILE *cfp; /* open filehandle for count vector saves*/ float archpri; /* "architecture" prior on model size */ float pamwgt; /* weight on PAM for heuristic prior */ int do_append; /* TRUE to append to hmmfile */ int do_binary; /* TRUE to write in binary format */ float blosumlevel; /* BLOSUM frac id filtering level [0.62] */ float swentry; /* S/W aggregate entry probability */ float swexit; /* S/W aggregate exit probability */ int do_eff; /* TRUE to set an effective seq number */ float eff_nseq; /* effective sequence number */ int pbswitch; /* nseq >= this, switchover to PB weights*/ char *setname; /* NULL, or ptr to HMM name to set */ int gapmax_set; /* TRUE if gapmax was set on commandline */ /*********************************************** * Parse command line ***********************************************/ format = MSAFILE_UNKNOWN; /* autodetect format by default. */ c_strategy = P7_MAP_CONSTRUCTION; w_strategy = WGT_GSC; blosumlevel = 0.62; cfg_strategy = P7_LS_CONFIG; gapmax = 0.5; overwrite_protect = TRUE; verbose = FALSE; rndfile = NULL; prifile = NULL; pamfile = NULL; align_ofile = NULL; alignfp = NULL; cfile = NULL; cfp = NULL; archpri = 0.85; pamwgt = 20.; Alphabet_type = hmmNOTSETYET; /* initially unknown */ name = NULL; do_append = FALSE; swentry = 0.5; swexit = 0.5; do_eff = TRUE; do_binary = FALSE; pbswitch = 1000; setname = NULL; gapmax_set = FALSE; while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "-f") == 0) cfg_strategy = P7_FS_CONFIG; else if (strcmp(optname, "-g") == 0) cfg_strategy = P7_BASE_CONFIG; else if (strcmp(optname, "-n") == 0) setname = optarg; else if (strcmp(optname, "-o") == 0) align_ofile = optarg; else if (strcmp(optname, "-s") == 0) cfg_strategy = P7_SW_CONFIG; else if (strcmp(optname, "-A") == 0) do_append = TRUE; else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE; else if (strcmp(optname, "--amino") == 0) SetAlphabet(hmmAMINO); else if (strcmp(optname, "--archpri") == 0) archpri = atof(optarg); else if (strcmp(optname, "--binary") == 0) do_binary = TRUE; else if (strcmp(optname, "--cfile") == 0) cfile = optarg; else if (strcmp(optname, "--fast") == 0) c_strategy = P7_FAST_CONSTRUCTION; else if (strcmp(optname, "--gapmax") == 0) { gapmax = atof(optarg); gapmax_set = TRUE; } else if (strcmp(optname, "--hand") == 0) c_strategy = P7_HAND_CONSTRUCTION; else if (strcmp(optname, "--idlevel") == 0) blosumlevel = atof(optarg); else if (strcmp(optname, "--noeff") == 0) do_eff = FALSE; else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC); else if (strcmp(optname, "--null") == 0) rndfile = optarg; else if (strcmp(optname, "--pam") == 0) pamfile = optarg; else if (strcmp(optname, "--pamwgt") == 0) pamwgt = atof(optarg); else if (strcmp(optname, "--pbswitch")== 0) pbswitch = atoi(optarg); else if (strcmp(optname, "--prior") == 0) prifile = optarg; else if (strcmp(optname, "--swentry") == 0) swentry = atof(optarg); else if (strcmp(optname, "--swexit") == 0) swexit = atof(optarg); else if (strcmp(optname, "--verbose") == 0) verbose = TRUE; else if (strcmp(optname, "--wgsc") == 0) w_strategy = WGT_GSC; else if (strcmp(optname, "--wblosum") == 0) w_strategy = WGT_BLOSUM; else if (strcmp(optname, "--wme") == 0) w_strategy = WGT_ME; else if (strcmp(optname, "--wpb") == 0) w_strategy = WGT_PB; else if (strcmp(optname, "--wnone") == 0) w_strategy = WGT_NONE; else if (strcmp(optname, "--wvoronoi")== 0) w_strategy = WGT_VORONOI; else if (strcmp(optname, "--informat") == 0) { format = String2SeqfileFormat(optarg); if (format == MSAFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); if (! IsAlignmentFormat(format)) Die("%s is an unaligned format, can't read as an alignment", optarg); } else if (strcmp(optname, "-h") == 0) { Banner(stdout, banner); puts(usage); puts(experts); exit(EXIT_SUCCESS); } } if (argc - optind != 2) Die("Incorrect number of arguments.\n%s\n", usage); hmmfile = argv[optind++]; seqfile = argv[optind++]; if (gapmax < 0. || gapmax > 1.) Die("--gapmax must be a value from 0 to 1\n%s\n", usage); if (archpri < 0. || archpri > 1.) Die("--archpri must be a value from 0 to 1\n%s\n", usage); if (overwrite_protect && !do_append && FileExists(hmmfile)) Die("HMM file %s already exists. Rename or delete it.", hmmfile); if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile)) Die("Alignment resave file %s exists. Rename or delete it.", align_ofile); if (gapmax_set && c_strategy != P7_FAST_CONSTRUCTION) Die("using --gapmax only makes sense if you use --fast"); /*********************************************** * Preliminaries: open our files for i/o ***********************************************/ /* Open the alignment */ if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL) Die("Alignment file %s could not be opened for reading", seqfile); /* Open the HMM output file */ if (do_append) strcpy(fpopts, "a"); else strcpy(fpopts, "w"); if (do_binary) strcat(fpopts, "b"); if ((hmmfp = fopen(hmmfile, fpopts)) == NULL) Die("Failed to open HMM file %s for %s\n", hmmfile, do_append ? "appending" : "writing"); /* Open the count vector save file */ cfp = NULL; if (cfile != NULL) if ((cfp = fopen(cfile, "w")) == NULL) Die("Failed to open count vector file %s for writing\n", cfile); /* Open the alignment resave file */ alignfp = NULL; if (align_ofile != NULL) if ((alignfp = fopen(align_ofile, "w")) == NULL) Die("Failed to open alignment resave file %s for writing\n", align_ofile); /*********************************************** * Show the banner ***********************************************/ Banner(stdout, banner); printf("Alignment file: %s\n", seqfile); printf("File format: %s\n", SeqfileFormat2String(afp->format)); printf("Search algorithm configuration: "); if (cfg_strategy == P7_BASE_CONFIG) puts("Global alignment (hmms)"); else if (cfg_strategy == P7_SW_CONFIG) { puts("Local (hmmsw)"); printf("S/W aggregate entry probability: %.2f\n", swentry); printf("S/W aggregate exit probability: %.2f\n", swexit); } else if (cfg_strategy == P7_LS_CONFIG) puts("Multiple domain (hmmls)"); else if (cfg_strategy == P7_FS_CONFIG) { puts("Multiple local (hmmfs)"); printf("S/W aggregate entry probability: %.2f\n", swentry); printf("S/W aggregate exit probability: %.2f\n", swexit); } printf("Model construction strategy: "); if (c_strategy == P7_HAND_CONSTRUCTION) puts("Manual, from #=RF annotation"); else if (c_strategy==P7_FAST_CONSTRUCTION) printf("Fast/ad hoc (gapmax %.2f)\n", gapmax); else printf("MAP (gapmax hint: %.2f)\n", gapmax); printf("Null model used: %s\n", (rndfile == NULL) ? "(default)" : rndfile); printf("Prior used: %s\n", (prifile == NULL) ? "(default)" : prifile); printf("Sequence weighting method: "); if (w_strategy == WGT_NONE) puts("none"); else if (w_strategy == WGT_GSC) puts("G/S/C tree weights"); else if (w_strategy == WGT_BLOSUM) printf("BLOSUM filter at %.2f id\n", blosumlevel); else if (w_strategy == WGT_PB) puts("Henikoff position-based"); else if (w_strategy == WGT_VORONOI)puts("Sibbald/Argos Voronoi"); else if (w_strategy == WGT_ME) puts("Maximum entropy"); printf("New HMM file: %s %s\n", hmmfile, do_append? "[appending]" : ""); if (cfile != NULL) printf("Count vectors saved to: %s\n", cfile); if (align_ofile != NULL) printf("Annotated alignment(s) resaved to: %s\n", align_ofile); printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); /*********************************************** * Get alignment(s), build HMMs one at a time ***********************************************/ nali = 0; while ((msa = MSAFileRead(afp)) != NULL) { /* Print some stuff about what we're about to do. */ if (msa->name != NULL) printf("Alignment: %s\n", msa->name); else printf("Alignment: #%d\n", nali+1); printf ("Number of sequences: %d\n", msa->nseq); printf ("Number of columns: %d\n", msa->alen); puts(""); fflush(stdout);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -