📄 hmmpfam.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* hmmpfam.c * SRE, Mon Aug 25 17:03:14 1997 [Denver] * * Search a single sequence against an HMM database. * Conditionally includes PVM parallelization when HMMER_PVM is defined * at compile time; hmmpfam --pvm runs the PVM version. * * CVS $Id: hmmpfam.c,v 1.23 2001/06/11 14:49:47 eddy Exp $ */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <limits.h>#include <float.h>#ifdef HMMER_THREADS#include <pthread.h>#endif#ifdef HMMER_PVM#include <pvm3.h>#endif#include "squid.h" /* general sequence analysis library */#include "config.h" /* compile-time configuration constants */#include "structs.h" /* data structures, macros, #define's */#include "funcs.h" /* function declarations */#include "globals.h" /* alphabet global variables */#include "version.h" /* version info */static char banner[] = "hmmpfam - search one or more sequences against HMM database";static char usage[] = "\Usage: hmmpfam [-options] <hmm database> <sequence file or database>\n\ Available options are:\n\ -h : help; print brief help on version and usage\n\ -n : nucleic acid models/sequence (default protein)\n\ -A <n> : sets alignment output limit to <n> best domain alignments\n\ -E <x> : sets E value cutoff (globE) to <x>; default 10\n\ -T <x> : sets T bit threshold (globT) to <x>; no threshold by default\n\ -Z <n> : sets Z (# models) for E-value calculation\n\";static char experts[] = "\ --acc : use HMM accession numbers instead of names in output\n\ --compat : make best effort to use last version's output style\n\ --cpu <n> : run <n> threads in parallel (if threaded)\n\ --cut_ga : use Pfam GA gathering threshold cutoffs\n\ --cut_nc : use Pfam NC noise threshold cutoffs\n\ --cut_tc : use Pfam TC trusted threshold cutoffs\n\ --domE <x> : sets domain Eval cutoff (2nd threshold) to <x>\n\ --domT <x> : sets domain T bit thresh (2nd threshold) to <x>\n\ --forward : use the full Forward() algorithm instead of Viterbi\n\ --informat <s>: sequence file is in format <s>, not FASTA\n\ --null2 : turn OFF the post hoc second null model\n\ --pvm : run on a PVM (Parallel Virtual Machine) cluster\n\ --xnu : turn ON XNU filtering of query protein sequence\n\\n";static struct opt_s OPTIONS[] = { { "-h", TRUE, sqdARG_NONE }, { "-n", TRUE, sqdARG_NONE }, { "-A", TRUE, sqdARG_INT }, { "-E", TRUE, sqdARG_FLOAT}, { "-T", TRUE, sqdARG_FLOAT}, { "-Z", TRUE, sqdARG_INT }, { "--acc", FALSE, sqdARG_NONE }, { "--compat", FALSE, sqdARG_NONE }, { "--cpu", FALSE, sqdARG_INT }, { "--cut_ga", FALSE, sqdARG_NONE }, { "--cut_nc", FALSE, sqdARG_NONE }, { "--cut_tc", FALSE, sqdARG_NONE }, { "--domE", FALSE, sqdARG_FLOAT}, { "--domT", FALSE, sqdARG_FLOAT}, { "--forward", FALSE, sqdARG_NONE }, { "--informat",FALSE, sqdARG_STRING}, { "--null2", FALSE, sqdARG_NONE }, { "--pvm", FALSE, sqdARG_NONE }, { "--xnu", FALSE, sqdARG_NONE },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))#ifdef HMMER_THREADS/* POSIX threads version: * the threads share a workpool_s structure amongst themselves, * for obtaining locks on input HMM file and output histogram and * tophits structures. */struct workpool_s { /* Shared configuration resources that don't change: */ char *hmmfile; /* name of HMM file */ char *dsq; /* digitized query sequence */ char *seqname; /* sequence name */ int L; /* length of dsq */ int do_forward; /* TRUE to score using Forward */ int do_null2; /* TRUE to apply null2 correction */ struct threshold_s *thresh; /* score/evalue cutoff information */ /* Shared (mutex-protected) input resources: */ HMMFILE *hmmfp; /* ptr to open HMM file */ int nhmm; /* number of HMMs searched so far */ pthread_mutex_t input_lock; /* mutex for locking input */ /* Shared (mutex-protected) output resources: */ struct tophit_s *ghit; /* per-sequence top hits */ struct tophit_s *dhit; /* per-domain top hits */ pthread_mutex_t output_lock; /* mutex for locking output */ /* Thread pool information */ pthread_t *thread; /* our pool of threads */ int num_threads; /* number of threads */};static struct workpool_s *workpool_start(char *hmmfile, HMMFILE *hmmfp, char *dsq, char *seqname, int L, int do_forward, int do_null2, struct threshold_s *thresh, struct tophit_s *ghit, struct tophit_s *dhit, int num_threads);static void workpool_stop(struct workpool_s *wpool);static void workpool_free(struct workpool_s *wpool);static void *worker_thread(void *ptr);#endif /* HMMER_THREADS */#ifdef HMMER_PVMstatic void main_loop_pvm(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo, struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2, struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm);#endifstatic void main_loop_serial(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo, struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2, int num_threads, struct tophit_s *ghit, struct tophit_s *dhit, int *nhmm);intmain(int argc, char **argv) { char *hmmfile; /* file to read HMMs from */ HMMFILE *hmmfp; /* opened hmmfile for reading */ char *seqfile; /* file to read target sequence from */ SQFILE *sqfp; /* opened seqfile for reading */ int format; /* format of seqfile */ char *seq; /* target sequence */ SQINFO sqinfo; /* optional info for seq */ struct fancyali_s *ali; /* an alignment for display */ struct tophit_s *ghit; /* list of top hits and alignments for seq */ struct tophit_s *dhit; /* list of top hits/alignments for domains */ float sc; /* log-odds score in bits */ double pvalue; /* pvalue of an HMM score */ double evalue; /* evalue of an HMM score */ double motherp; /* pvalue of a whole seq HMM score */ float mothersc; /* score of a whole seq parent of domain */ int sqfrom, sqto; /* coordinates in sequence */ int hmmfrom, hmmto; /* coordinate in HMM */ char *name, *acc, *desc; /* hit HMM name, accession, description */ int hmmlen; /* length of HMM hit */ int nhmm; /* number of HMMs searched */ int domidx; /* number of this domain */ int ndom; /* total # of domains in this seq */ int namewidth; /* max width of printed HMM name */ int descwidth; /* max width of printed description */ int Alimit; /* A parameter limiting output alignments */ struct threshold_s thresh; /* contains all threshold (cutoff) info */ char *optname; /* name of option found by Getopt() */ char *optarg; /* argument found by Getopt() */ int optind; /* index in argv[] */ int do_forward; /* TRUE to use Forward() not Viterbi() */ int do_nucleic; /* TRUE to do DNA/RNA instead of protein */ int do_null2; /* TRUE to adjust scores with null model #2 */ int do_pvm; /* TRUE to run on PVM */ int do_xnu; /* TRUE to do XNU filtering */ int be_backwards; /* TRUE to be backwards-compatible in output*/ int show_acc; /* TRUE to sub HMM accessions for names */ int i; int nreported; int num_threads; /* number of worker threads */ /*********************************************** * Parse command line ***********************************************/ format = SQFILE_UNKNOWN; /* default: autodetect format w/ Babelfish */ do_forward = FALSE; do_nucleic = FALSE; do_null2 = TRUE; do_pvm = FALSE; do_xnu = FALSE; be_backwards= FALSE; show_acc = FALSE; Alimit = INT_MAX; /* no limit on alignment output */ thresh.globE = 10.0; /* use a reasonable Eval threshold; */ thresh.globT = -FLT_MAX; /* but no bit threshold, */ thresh.domT = -FLT_MAX; /* no domain bit threshold, */ thresh.domE = FLT_MAX; /* and no domain Eval threshold. */ thresh.autocut = CUT_NONE; /* and no Pfam cutoffs used. */ thresh.Z = 0; /* Z not preset, so determined by # of HMMs */#ifdef HMMER_THREADS num_threads = ThreadNumber(); /* only matters if we're threaded */#else num_threads = 0;#endif while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "-n") == 0) do_nucleic = TRUE; else if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg); else if (strcmp(optname, "-E") == 0) thresh.globE = atof(optarg); else if (strcmp(optname, "-T") == 0) thresh.globT = atof(optarg); else if (strcmp(optname, "-Z") == 0) thresh.Z = atoi(optarg); else if (strcmp(optname, "--acc") == 0) show_acc = TRUE; else if (strcmp(optname, "--compat") == 0) be_backwards = TRUE; else if (strcmp(optname, "--cpu") == 0) num_threads = atoi(optarg); else if (strcmp(optname, "--cut_ga") == 0) thresh.autocut = CUT_GA; else if (strcmp(optname, "--cut_nc") == 0) thresh.autocut = CUT_NC; else if (strcmp(optname, "--cut_tc") == 0) thresh.autocut = CUT_TC; else if (strcmp(optname, "--domE") == 0) thresh.domE = atof(optarg); else if (strcmp(optname, "--domT") == 0) thresh.domT = atof(optarg); else if (strcmp(optname, "--forward") == 0) do_forward = TRUE; else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE; else if (strcmp(optname, "--pvm") == 0) do_pvm = TRUE; else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE; else if (strcmp(optname, "--informat") == 0) { format = String2SeqfileFormat(optarg); if (format == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if (strcmp(optname, "-h") == 0) { Banner(stdout, banner); puts(usage); puts(experts); exit(0); } } if (argc - optind != 2) Die("Incorrect number of arguments.\n%s\n", usage); hmmfile = argv[optind++]; seqfile = argv[optind++]; #ifndef HMMER_PVM if (do_pvm) Die("PVM support is not compiled into HMMER; --pvm doesn't work.");#endif#ifndef HMMER_THREADS if (num_threads) Die("Posix threads support is not compiled into HMMER; --cpu doesn't have any effect");#endif /*********************************************** * Open sequence database (must be in curr directory); * get target sequence. ***********************************************/ if (do_nucleic) SetAlphabet(hmmNUCLEIC); else SetAlphabet(hmmAMINO); if (do_nucleic && do_xnu) Die("You can't use -n and --xnu together: I can't xnu DNA data."); if ((sqfp = SeqfileOpen(seqfile, format, NULL)) == NULL) Die("Failed to open sequence file %s\n%s\n", seqfile, usage); /*********************************************** * Open HMM database (might be in HMMERDB or current directory) ***********************************************/ if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) Die("Failed to open HMM database %s\n%s", hmmfile, usage); /*********************************************** * Show the banner ***********************************************/ Banner(stdout, banner); printf( "HMM file: %s\n", hmmfile); printf( "Sequence file: %s\n", seqfile); if (do_pvm) printf( "PVM: ACTIVE\n"); printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n"); /*********************************************** * Search each HMM against each sequence ***********************************************/ while (ReadSeq(sqfp, format, &seq, &sqinfo)) { ghit = AllocTophits(20); /* keeps full seq scores */ dhit = AllocTophits(20); /* keeps domain scores */ /* 1. Search sequence against all HMMs. * Significant scores+alignments accumulate in ghit, dhit. */ if (!do_pvm) main_loop_serial(hmmfile, hmmfp, seq, &sqinfo, &thresh, do_xnu, do_forward, do_null2, num_threads, ghit, dhit, &nhmm);#ifdef HMMER_PVM else if (do_pvm) { SQD_DPRINTF1(("Entering PVM main loop\n")); main_loop_pvm(hmmfile, hmmfp, seq, &sqinfo, &thresh, do_xnu, do_forward, do_null2, ghit, dhit, &nhmm); }#endif else Die("wait. that can't happen. I didn't do anything."); /* set Z for good now that we're done */ if (!thresh.Z) thresh.Z = nhmm; /* 2. (Done searching all HMMs for this query seq; start output) * Report the overall sequence hits, sorted by significance. */ if (be_backwards) { printf("Query: %s %s\n", sqinfo.name, sqinfo.flags & SQINFO_DESC ? sqinfo.desc : ""); } else { printf("\nQuery sequence: %s\n", sqinfo.name); printf("Accession: %s\n", sqinfo.flags &SQINFO_ACC ? sqinfo.acc : "[none]"); printf("Description: %s\n", sqinfo.flags &SQINFO_DESC? sqinfo.desc : "[none]"); } /* We'll now sort the global hit list by evalue... * (not score! that was bug #12. in hmmpfam, score and evalue are not * monotonic.) */ FullSortTophits(ghit); namewidth = MAX(8, TophitsMaxName(ghit)); /* must print whole name, no truncation */ descwidth = MAX(52-namewidth, 11); /* may truncate desc, but avoid neg len! */ printf("\nScores for sequence family classification (score includes all domains):\n"); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Model", descwidth, "Description", "Score", "E-value", " N "); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---"); for (i = 0, nreported = 0; i < ghit->num; i++) { char *safedesc; GetRankedHit(ghit, i, &pvalue, &sc, NULL, NULL, &name, &acc, &desc, NULL, NULL, NULL, /* seq positions */ NULL, NULL, NULL, /* HMM positions */ NULL, &ndom, /* domain info */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -