📄 hmmpostal.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* Derived from code developed by Ian Holmes (Sanger Centre and UC Berkeley) * Copyright (C) 1998 Ian Holmes * Distributed under the GNU General Public License */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "structs.h" /* data structures, macros, #define's */#include "config.h" /* compile-time configuration constants */#include "funcs.h" /* function declarations */#include "globals.h" /* alphabet global variables */#include "squid.h" /* general sequence analysis library */static char banner[] = "hmmbuild - build a hidden Markov model from an alignment";static char usage[] = "\Usage: hmmbuildpost [-options] <alignment file>\n\ Available options are:\n\ -h : help; print brief help on version and usage\n\ -n <s> : name; name this HMM <s>\n\ -r <hmmfile> : read HMM from <file> instead of building\n\ -m <hmmfile> : save HMM to <file>\n\ -o <file> : re-save annotated alignment to <file>\n\ -A : append; append this HMM to <hmmfile>\n\ -F : force; allow overwriting of <hmmfile>\n\\n\ Alternative search algorithm styles: (default: hmmls domain alignment)\n\ -f : multi-hit local (hmmfs style)\n\ -g : global alignment (hmms style, Needleman/Wunsch)\n\ -s : local alignment (hmmsw style, Smith/Waterman)\n\";static char experts[] = "\ Optional re-alignment of sequences to model:\n\ --viterbi : standard max-likelihood (Viterbi) algorithm\n\ --optacc : optimal accuracy algorithm\n\\n\ Alternative model construction strategies: (default: MAP)\n\ --fast : Krogh/Haussler fast heuristic construction (see --gapmax)\n\ --hand : manual construction (requires SELEX file, #=RF annotation)\n\\n\ Expert customization of parameters and priors:\n\ --null <file> : read null (random sequence) model from <file>\n\ --pam <file> : heuristic PAM-based prior, using BLAST PAM matrix in <file>\n\ --prior <file> : read Dirichlet prior parameters from <file>\n\\n\ Alternative sequence weighting strategies: (default: GSC weights)\n\ --wblosum : Henikoff simple filter weights (see --idlevel)\n\ --wgsc : Gerstein/Sonnhammer/Chothia tree weights (default)\n\ --wme : maximum entropy (ME)\n\ --wvoronoi : Sibbald/Argos Voronoi weights\n\ --wnone : don't do any weighting\n\ --noeff : don't use effective sequence number; just use nseq\n\\n\ Forcing an alphabet: (normally autodetected)\n\ --amino : override autodetection, assert that seqs are protein\n\ --nucleic : override autodetection, assert that seqs are DNA/RNA\n\\n\ Other expert options:\n\ --archpri <x> : set architecture size prior to <x> {0.85} [0..1]\n\ --binary : save the model in binary format, not ASCII text\n\ --cfile <file>: save count vectors to <file>\n\ --gapmax <x> : max fraction of gaps in mat column {0.50} [0..1]\n\ --idlevel <x> : set frac. id level used by eff. nseq and --wblosum {0.62}\n\ --informat <s>: input alignment is in format <s>, not Stockholm\n\ --pamwgt <x> : set weight on PAM-based prior to <x> {20.}[>=0]\n\ --swentry <x> : set S/W aggregate entry prob. to <x> {0.5}\n\ --swexit <x> : set S/W aggregate exit prob. to <x> {0.5}\n\ --verbose : print a lot of boring information\n\\n";static struct opt_s OPTIONS[] = { { "-f", TRUE, sqdARG_NONE }, { "-g", TRUE, sqdARG_NONE }, { "-h", TRUE, sqdARG_NONE }, { "-n", TRUE, sqdARG_STRING}, { "-r", TRUE, sqdARG_STRING}, { "-m", TRUE, sqdARG_STRING}, { "-o", TRUE, sqdARG_STRING}, { "-s", TRUE, sqdARG_NONE }, { "-A", TRUE, sqdARG_NONE }, { "-F", TRUE, sqdARG_NONE }, { "--amino", FALSE, sqdARG_NONE }, { "--archpri", FALSE, sqdARG_FLOAT }, { "--binary", FALSE, sqdARG_NONE }, { "--cfile", FALSE, sqdARG_STRING}, { "--fast", FALSE, sqdARG_NONE}, { "--gapmax", FALSE, sqdARG_FLOAT }, { "--hand", FALSE, sqdARG_NONE}, { "--idlevel", FALSE, sqdARG_FLOAT }, { "--informat",FALSE, sqdARG_STRING }, { "--noeff", FALSE, sqdARG_NONE }, { "--nucleic", FALSE, sqdARG_NONE }, { "--null", FALSE, sqdARG_STRING }, { "--optacc", FALSE, sqdARG_NONE }, { "--pam", FALSE, sqdARG_STRING }, { "--pamwgt", FALSE, sqdARG_FLOAT }, { "--prior", FALSE, sqdARG_STRING }, { "--swentry", FALSE, sqdARG_FLOAT }, { "--swexit", FALSE, sqdARG_FLOAT }, { "--verbose", FALSE, sqdARG_NONE }, { "--viterbi", FALSE, sqdARG_NONE }, { "--wgsc", FALSE, sqdARG_NONE }, { "--wblosum", FALSE, sqdARG_NONE }, { "--wme", FALSE, sqdARG_NONE }, { "--wnone", FALSE, sqdARG_NONE }, { "--wvoronoi",FALSE, sqdARG_NONE },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))static void save_model(struct plan7_s *hmm, char *hmmfile, int do_append, int do_binary);static void print_all_scores(FILE *fp, struct plan7_s *hmm, AINFO *ainfo, char **dsq, int nseq, struct p7trace_s **tr);static void save_countvectors(char *cfile, struct plan7_s *hmm);static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt, int nseq, struct p7trace_s **tr, float *pernode, float *ret_avg);static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, float *pernode, float expected);static void maximum_entropy(struct plan7_s *hmm, char **dsq, AINFO *ainfo, int nseq, float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr);extern void Postcode(int L, struct dpmatrix_s *mx, struct p7trace_s *tr);intmain(int argc, char **argv) { char *seqfile; /* seqfile to read alignment from */ int format; /* format of seqfile */ MSAFILE *afp; /* open alignment file */ MSA *msa; /* a multiple sequence alignment */ char **dsq; /* digitized unaligned aseq's */ struct plan7_s *hmm; /* constructed HMM; written to hmmfile */ struct p7prior_s *pri; /* Dirichlet priors to use */ struct p7trace_s **tr; /* fake tracebacks for aseq's */ char *readfile; /* file to read HMM from */ HMMFILE *hmmfp; /* opened hmmfile for reading */ char *hmmfile; /* file to write HMM to */ FILE *fp; /* OUTPUT file handle (misc.) */ char *name; /* name of the HMM */ int idx; /* counter for sequences */ float randomseq[MAXABET]; /* null sequence model */ float p1; /* null sequence model p1 transition */ char *optname; /* name of option found by Getopt() */ char *optarg; /* argument found by Getopt() */ int optind; /* index in argv[] */ enum p7_construction c_strategy; /* construction strategy choice */ enum p7_weight { /* weighting strategy */ WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_VORONOI, WGT_ME} w_strategy; enum p7_config { /* algorithm configuration strategy */ P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy; float gapmax; /* max frac gaps in mat col for -k */ int overwrite_protect; /* TRUE to prevent overwriting HMM file */ enum realignment_strategy { /* re-alignment strategy */ REALIGN_NONE, REALIGN_VITERBI, REALIGN_OPTACC } r_strategy; int verbose; /* TRUE to show a lot of output */ char *align_ofile; /* name of output alignment file */ char *rndfile; /* random sequence model file to read */ char *prifile; /* Dirichlet prior file to read */ char *pamfile; /* PAM matrix file for heuristic prior */ char *cfile; /* output file for count vectors */ float archpri; /* "architecture" prior on model size */ float pamwgt; /* weight on PAM for heuristic prior */ int do_append; /* TRUE to append to hmmfile */ int do_binary; /* TRUE to write in binary format */ float blosumlevel; /* BLOSUM frac id filtering level [0.62] */ float swentry; /* S/W aggregate entry probability */ float swexit; /* S/W aggregate exit probability */ int do_eff; /* TRUE to set an effective seq number */ float eff_nseq; /* effective sequence number */ int checksum; int len; struct dpmatrix_s *forward_mx; /* Forward matrix */ struct dpmatrix_s *backward_mx; /* Backward matrix */ struct dpmatrix_s *posterior_mx; /* Posterior matrix */ struct dpmatrix_s *optacc_mx; /* Optimal accuracy matrix */ /*********************************************** * Parse command line ***********************************************/ format = MSAFILE_UNKNOWN; c_strategy = P7_MAP_CONSTRUCTION; w_strategy = WGT_GSC; blosumlevel = 0.62; cfg_strategy = P7_LS_CONFIG; gapmax = 0.5; overwrite_protect = TRUE; r_strategy = REALIGN_NONE; verbose = FALSE; readfile = NULL; hmmfile = NULL; align_ofile = NULL; rndfile = NULL; prifile = NULL; pamfile = NULL; cfile = NULL; archpri = 0.85; pamwgt = 20.; Alphabet_type = hmmNOTSETYET; /* initially unknown */ name = NULL; do_append = FALSE; swentry = 0.5; swexit = 0.5; do_eff = TRUE; do_binary = FALSE; while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "-f") == 0) cfg_strategy = P7_FS_CONFIG; else if (strcmp(optname, "-g") == 0) cfg_strategy = P7_BASE_CONFIG; else if (strcmp(optname, "-n") == 0) name = Strdup(optarg); else if (strcmp(optname, "-r") == 0) readfile = optarg; else if (strcmp(optname, "-m") == 0) hmmfile = optarg; else if (strcmp(optname, "-o") == 0) align_ofile = optarg; else if (strcmp(optname, "-r") == 0) rndfile = optarg; else if (strcmp(optname, "-s") == 0) cfg_strategy = P7_SW_CONFIG; else if (strcmp(optname, "-A") == 0) do_append = TRUE; else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE; else if (strcmp(optname, "--amino") == 0) SetAlphabet(hmmAMINO); else if (strcmp(optname, "--archpri") == 0) archpri = atof(optarg); else if (strcmp(optname, "--binary") == 0) do_binary = TRUE; else if (strcmp(optname, "--cfile") == 0) cfile = optarg; else if (strcmp(optname, "--fast") == 0) c_strategy = P7_FAST_CONSTRUCTION; else if (strcmp(optname, "--hand") == 0) c_strategy = P7_HAND_CONSTRUCTION; else if (strcmp(optname, "--gapmax") == 0) gapmax = atof(optarg); else if (strcmp(optname, "--idlevel") == 0) blosumlevel = atof(optarg); else if (strcmp(optname, "--noeff") == 0) do_eff = FALSE; else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC); else if (strcmp(optname, "--optacc") == 0) r_strategy = REALIGN_OPTACC; else if (strcmp(optname, "--pam") == 0) pamfile = optarg; else if (strcmp(optname, "--pamwgt") == 0) pamwgt = atof(optarg); else if (strcmp(optname, "--prior") == 0) prifile = optarg; else if (strcmp(optname, "--swentry") == 0) swentry = atof(optarg); else if (strcmp(optname, "--swexit") == 0) swexit = atof(optarg); else if (strcmp(optname, "--verbose") == 0) verbose = TRUE; else if (strcmp(optname, "--viterbi") == 0) r_strategy = REALIGN_VITERBI; else if (strcmp(optname, "--wgsc") == 0) w_strategy = WGT_GSC; else if (strcmp(optname, "--wblosum") == 0) w_strategy = WGT_BLOSUM; else if (strcmp(optname, "--wme") == 0) w_strategy = WGT_ME; else if (strcmp(optname, "--wnone") == 0) w_strategy = WGT_NONE; else if (strcmp(optname, "--wvoronoi")== 0) w_strategy = WGT_VORONOI; else if (strcmp(optname, "--informat") == 0) { format = String2SeqfileFormat(optarg); if (format == MSAFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); if (! IsAlignmentFormat(format)) Die("%s is an unaligned format, can't read as an alignment", optarg); } else if (strcmp(optname, "-h") == 0) { Banner(stdout, banner); puts(usage); puts(experts); exit(0); } } if (argc - optind != 1) Die("Incorrect number of arguments.\n%s\n", usage); seqfile = argv[optind++]; if (readfile != NULL && r_strategy == REALIGN_NONE) r_strategy = REALIGN_VITERBI; if (gapmax < 0. || gapmax > 1.) Die("--gapmax must be a value from 0 to 1\n%s\n", usage); if (archpri < 0. || archpri > 1.) Die("--archpri must be a value from 0 to 1\n%s\n", usage); if (overwrite_protect && hmmfile && !do_append && FileExists(hmmfile)) Die("HMM file %s already exists. Rename or delete it.", hmmfile); if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile)) Die("Alignment resave file %s exists. Rename or delete it.", align_ofile); /*********************************************** * Get sequence data ***********************************************/ /* Open the alignment */ if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL) Die("Alignment file %s could not be opened for reading", seqfile); /* read the alignment from file */ if ((msa = MSAFileRead(afp)) == NULL) Die("Failed to read aligned sequence file %s", seqfile); for (idx = 0; idx < msa->nseq; idx++) s2upper(msa->aseq[idx]); MSAFileClose(afp); /* Set up the alphabet globals */ if (Alphabet_type == hmmNOTSETYET) DetermineAlphabet(msa->aseq, msa->nseq); /* Set up Dirichlet priors */ if (prifile == NULL) pri = P7DefaultPrior(); else pri = P7ReadPrior(prifile); if (pamfile != NULL) PAMPrior(pamfile, pri, pamwgt); /* Set up the null/random seq model */ if (rndfile == NULL) P7DefaultNullModel(randomseq, &p1); else P7ReadNullModel(rndfile, randomseq, &p1); /* Prepare sequences for internal use */ DigitizeAlignment(msa, &dsq); /* In some respects we treat DNA more crudely... */ if (Alphabet_type == hmmNUCLEIC) { do_eff = FALSE; /* don't do effective seq #; it's calibrated for protein */ } /*********************************************** * Either read in an HMM or build from alignment, * depending on user specifications. ***********************************************/ if (readfile != NULL) { /*********************************************** * Open HMM file (might be in HMMERDB or current directory). * Read a single HMM from it. ***********************************************/ if ((hmmfp = HMMFileOpen(readfile, "HMMERDB")) == NULL) Die("Failed to open HMM file %s\n%s", readfile, usage); if (!HMMFileRead(hmmfp, &hmm)) Die("Failed to read any HMMs from %s\n", readfile); HMMFileClose(hmmfp); if (hmm == NULL) Die("HMM file %s corrupt or in incorrect format? Parse failed", readfile); tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * msa->nseq); for (idx = 0; idx < msa->nseq; idx++) tr[idx] = 0; } else { /*********************************************** * Build an HMM ***********************************************/ /* Determine the effective sequence number to use (optional) */ eff_nseq = (float) msa->nseq; if (do_eff) { float *wgt; printf("%-40s ... ", "Determining effective sequence number"); fflush(stdout); /* dummy weights array to feed BlosumWeights*/ wgt = MallocOrDie(sizeof(float) * msa->nseq); BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt); eff_nseq = FSum(wgt, msa->nseq); free(wgt);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -