📄 hmmsearch.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* hmmsearch.c * SRE, Tue Jan 7 17:19:20 1997 [St. Louis] * * Search a sequence database with a profile HMM. * Conditionally includes PVM parallelization when HMMER_PVM is defined * at compile time; hmmsearch --pvm runs the PVM version. * * CVS $Id: hmmsearch.c,v 1.34 2001/06/11 14:49:47 eddy Exp $ */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <limits.h>#include <float.h>#ifdef HMMER_THREADS#include <pthread.h>#endif#ifdef HMMER_PVM#include <pvm3.h>#endif#include "squid.h" /* general sequence analysis library */#include "config.h" /* compile-time configuration constants */#include "structs.h" /* data structures, macros, #define's */#include "funcs.h" /* function declarations */#include "globals.h" /* alphabet global variables */#include "version.h" /* version info */static char banner[] = "hmmsearch - search a sequence database with a profile HMM";static char usage[] = "\Usage: hmmsearch [-options] <hmmfile> <sequence file or database>\n\ Available options are:\n\ -h : help; print brief help on version and usage\n\ -A <n> : sets alignment output limit to <n> best domain alignments\n\ -E <x> : sets E value cutoff (globE) to <= x\n\ -T <x> : sets T bit threshold (globT) to >= x\n\ -Z <n> : sets Z (# seqs) for E-value calculation\n\";static char experts[] = "\ --compat : make best effort to use last version's output style\n\ --cpu <n> : run <n> threads in parallel (if threaded)\n\ --cut_ga : use Pfam GA gathering threshold cutoffs\n\ --cut_nc : use Pfam NC noise threshold cutoffs\n\ --cut_tc : use Pfam TC trusted threshold cutoffs\n\ --domE <x>: sets domain Eval cutoff (2nd threshold) to <= x\n\ --domT <x>: sets domain T bit thresh (2nd threshold) to >= x\n\ --forward : use the full Forward() algorithm instead of Viterbi\n\ --informat <s>: sequence file is in format <s>, not FASTA\n\ --null2 : turn OFF the post hoc second null model\n\ --pvm : run on a Parallel Virtual Machine (PVM)\n\ --xnu : turn ON XNU filtering of target protein sequences\n\";static struct opt_s OPTIONS[] = { { "-h", TRUE, sqdARG_NONE }, { "-A", TRUE, sqdARG_INT }, { "-E", TRUE, sqdARG_FLOAT}, { "-T", TRUE, sqdARG_FLOAT}, { "-Z", TRUE, sqdARG_INT }, { "--compat", FALSE, sqdARG_NONE }, { "--cpu", FALSE, sqdARG_INT }, { "--cut_ga", FALSE, sqdARG_NONE }, { "--cut_nc", FALSE, sqdARG_NONE }, { "--cut_tc", FALSE, sqdARG_NONE }, { "--domE", FALSE, sqdARG_FLOAT}, { "--domT", FALSE, sqdARG_FLOAT}, { "--forward", FALSE, sqdARG_NONE }, { "--informat",FALSE, sqdARG_STRING}, { "--null2", FALSE, sqdARG_NONE }, { "--pvm", FALSE, sqdARG_NONE }, { "--xnu", FALSE, sqdARG_NONE },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))#ifdef HMMER_THREADS/* POSIX threads version: * the threads share a workpool_s structure amongst themselves, * for obtaining locks on input HMM file and output histogram and * tophits structures. */struct workpool_s { /* Shared configuration resources which don't change: */ struct plan7_s *hmm; /* HMM to search with */ int do_xnu; /* TRUE to apply XNU filter */ int do_forward; /* TRUE to score using Forward */ int do_null2; /* TRUE to apply null2 ad hoc correction */ struct threshold_s *thresh; /* score/evalue threshold info */ /* Shared (mutex-protected) input resources: */ SQFILE *sqfp; /* ptr to open sequence file */ int nseq; /* number of seqs searched so far */ pthread_mutex_t input_lock; /* mutex for locking input */ /* Shared (mutex-protected) output resources: */ struct tophit_s *ghit; /* per-sequence top hits */ struct tophit_s *dhit; /* per-domain top hits */ struct histogram_s *hist; /* histogram of scores */ pthread_mutex_t output_lock; /* mutex for locking output */ /* Thread pool information */ pthread_t *thread; /* our pool of threads */ int num_threads; /* number of threads */};static struct workpool_s *workpool_start(struct plan7_s *hmm, SQFILE *sqfp, int do_xnu, int do_forward, int do_null2, struct threshold_s *thresh, struct tophit_s *ghit, struct tophit_s *dhit, struct histogram_s *hist, int num_threads);static void workpool_stop(struct workpool_s *wpool);static void workpool_free(struct workpool_s *wpool);static void *worker_thread(void *ptr);#endif /* HMMER_THREADS */static void main_loop_serial(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward, int do_null2, int do_xnu, int num_threads, struct histogram_s *histogram, struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq);#ifdef HMMER_PVMstatic void main_loop_pvm(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward, int do_null2, int do_xnu, struct histogram_s *histogram, struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq);#endifintmain(int argc, char **argv) { char *hmmfile; /* file to read HMM(s) from */ HMMFILE *hmmfp; /* opened hmmfile for reading */ char *seqfile; /* file to read target sequence(s) from */ SQFILE *sqfp; /* opened seqfile for reading */ int format; /* format of seqfile */ int i; struct plan7_s *hmm; /* HMM to search with */ struct histogram_s *histogram;/* histogram of all scores */ struct fancyali_s *ali; /* displayed alignment info */ struct tophit_s *ghit; /* list of top hits for whole sequences */ struct tophit_s *dhit; /* list of top hits for domains */ float sc; /* score of an HMM search */ double pvalue; /* pvalue of an HMM score */ double evalue; /* evalue of an HMM score */ double motherp; /* pvalue of a whole seq HMM score */ float mothersc; /* score of a whole seq parent of domain */ int sqfrom, sqto; /* coordinates in sequence */ int hmmfrom, hmmto; /* coordinate in HMM */ char *name, *acc, *desc; /* hit sequence name and description */ int sqlen; /* length of seq that was hit */ int nseq; /* number of sequences searched */ int Z; /* # of seqs for purposes of E-val calc */ int domidx; /* number of this domain */ int ndom; /* total # of domains in this seq */ int namewidth; /* max width of sequence name */ int descwidth; /* max width of description */ int nreported; /* # of hits reported in a list */ int Alimit; /* A parameter limiting output alignments */ struct threshold_s thresh; /* contains all threshold (cutoff) info */ char *optname; /* name of option found by Getopt() */ char *optarg; /* argument found by Getopt() */ int optind; /* index in argv[] */ int do_null2; /* TRUE to adjust scores with null model #2 */ int do_forward; /* TRUE to use Forward() not Viterbi() */ int do_xnu; /* TRUE to filter sequences thru XNU */ int do_pvm; /* TRUE to run on Parallel Virtual Machine */ int be_backwards; /* TRUE to be backwards-compatible in output*/ int num_threads; /* number of worker threads */ /*********************************************** * Parse command line ***********************************************/ format = SQFILE_UNKNOWN; /* default: autodetect seq file format w/ Babelfish */ do_forward = FALSE; do_null2 = TRUE; do_xnu = FALSE; do_pvm = FALSE; Z = 0; be_backwards= FALSE; Alimit = INT_MAX; /* no limit on alignment output */ thresh.globE = 10.0; /* use a reasonable Eval threshold; */ thresh.globT = -FLT_MAX; /* but no bit threshold, */ thresh.domT = -FLT_MAX; /* no domain bit threshold, */ thresh.domE = FLT_MAX; /* and no domain Eval threshold. */ thresh.autocut = CUT_NONE; /* and no Pfam cutoffs used */ thresh.Z = 0; /* Z not preset; use actual # of seqs */#ifdef HMMER_THREADS num_threads = ThreadNumber(); /* only matters if we're threaded */#else num_threads = 0;#endif while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg); else if (strcmp(optname, "-E") == 0) thresh.globE = atof(optarg); else if (strcmp(optname, "-T") == 0) thresh.globT = atof(optarg); else if (strcmp(optname, "-Z") == 0) thresh.Z = atoi(optarg); else if (strcmp(optname, "--compat") == 0) be_backwards = TRUE; else if (strcmp(optname, "--cpu") == 0) num_threads = atoi(optarg); else if (strcmp(optname, "--cut_ga") == 0) thresh.autocut = CUT_GA; else if (strcmp(optname, "--cut_nc") == 0) thresh.autocut = CUT_NC; else if (strcmp(optname, "--cut_tc") == 0) thresh.autocut = CUT_TC; else if (strcmp(optname, "--domE") == 0) thresh.domE = atof(optarg); else if (strcmp(optname, "--domT") == 0) thresh.domT = atof(optarg); else if (strcmp(optname, "--forward") == 0) do_forward = TRUE; else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE; else if (strcmp(optname, "--pvm") == 0) do_pvm = TRUE; else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE; else if (strcmp(optname, "--informat") == 0) { format = String2SeqfileFormat(optarg); if (format == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if (strcmp(optname, "-h") == 0) { Banner(stdout, banner); puts(usage); puts(experts); exit(0); } } if (argc - optind != 2) Die("Incorrect number of arguments.\n%s\n", usage); hmmfile = argv[optind++]; seqfile = argv[optind++]; #ifndef HMMER_PVM if (do_pvm) Die("PVM support is not compiled into your HMMER software; --pvm doesn't work.");#endif#ifndef HMMER_THREADS if (num_threads) Die("Posix threads support is not compiled into HMMER; --cpu doesn't have any effect");#endif /*********************************************** * Open sequence database (might be in BLASTDB or current directory) ***********************************************/ if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL) Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); /*********************************************** * Open HMM file (might be in HMMERDB or current directory). * Read a single HMM from it. (Config HMM, if necessary). * Alphabet globals are set by reading the HMM. ***********************************************/ if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) Die("Failed to open HMM file %s\n%s", hmmfile, usage); if (!HMMFileRead(hmmfp, &hmm)) Die("Failed to read any HMMs from %s\n", hmmfile); if (hmm == NULL) Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); P7Logoddsify(hmm, !do_forward); if (do_xnu && Alphabet_type == hmmNUCLEIC) Die("The HMM is a DNA model, and you can't use the --xnu filter on DNA data"); /***************************************************************** * Set up optional Pfam score thresholds. * Can do this before starting any searches, since we'll only use 1 HMM. *****************************************************************/ if (! SetAutocuts(&thresh, hmm)) Die("HMM %s did not contain the GA, TC, or NC cutoffs you needed", hmm->name); /*********************************************** * Show the banner ***********************************************/ Banner(stdout, banner); printf( "HMM file: %s [%s]\n", hmmfile, hmm->name); printf( "Sequence database: %s\n", seqfile); if (do_pvm) printf( "PVM: ACTIVE\n"); printf( "per-sequence score cutoff: "); if (thresh.globT == -FLT_MAX) printf("[none]\n"); else { printf(">= %.1f", thresh.globT); if (thresh.autocut == CUT_GA) printf(" [GA1]\n"); else if (thresh.autocut == CUT_NC) printf(" [NC1]\n"); else if (thresh.autocut == CUT_TC) printf(" [TC1]\n"); else printf("\n"); } printf( "per-domain score cutoff: "); if (thresh.domT == -FLT_MAX) printf("[none]\n"); else { printf(">= %.1f", thresh.domT); if (thresh.autocut == CUT_GA) printf(" [GA2]\n"); else if (thresh.autocut == CUT_NC) printf(" [NC2]\n"); else if (thresh.autocut == CUT_TC) printf(" [TC2]\n"); else printf("\n"); } printf( "per-sequence Eval cutoff: "); if (thresh.globE == FLT_MAX) printf("[none]\n"); else printf("<= %-10.2g\n", thresh.globE); printf( "per-domain Eval cutoff: "); if (thresh.domE == FLT_MAX) printf("[none]\n"); else printf("<= %10.2g\n", thresh.domE); printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n"); /*********************************************** * Search HMM against each sequence ***********************************************/ /* set up structures for storing output */ histogram = AllocHistogram(-200, 200, 100); /* keeps full histogram */ ghit = AllocTophits(200); /* per-seq hits: 200=lumpsize */ dhit = AllocTophits(200); /* domain hits: 200=lumpsize */ if (! do_pvm) main_loop_serial(hmm, sqfp, &thresh, do_forward, do_null2, do_xnu, num_threads, histogram, ghit, dhit, &nseq);#ifdef HMMER_PVM else main_loop_pvm(hmm, sqfp, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq);#endif /*********************************************** * Process hit lists, produce text output ***********************************************/ /* Set the theoretical EVD curve in our histogram using * calibration in the HMM, if available. */ if (hmm->flags & PLAN7_STATS) ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda, histogram->lowscore, histogram->highscore, 0); if (!thresh.Z) thresh.Z = nseq; /* set Z for good now that we're done. */ /* Format and report our output */ /* 1. Report overall sequence hits (sorted on E-value) */ if (be_backwards) { printf("\nQuery HMM: %s|%s|%s\n", hmm->name, hmm->flags & PLAN7_ACC ? hmm->acc : "", hmm->flags & PLAN7_DESC ? hmm->desc : ""); } else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -