📄 hmmsearch.c
字号:
printf("\nQuery HMM: %s\n", hmm->name); printf("Accession: %s\n", hmm->flags & PLAN7_ACC ? hmm->acc : "[none]"); printf("Description: %s\n", hmm->flags & PLAN7_DESC ? hmm->desc : "[none]"); } if (hmm->flags & PLAN7_STATS) printf(" [HMM has been calibrated; E-values are empirical estimates]\n"); else printf(" [No calibration for HMM; E-values are upper bounds]\n"); FullSortTophits(ghit); namewidth = MAX(8, TophitsMaxName(ghit)); /* cannot truncate name. */ descwidth = MAX(52-namewidth, 11);/* may truncate desc, but need strlen("Description") */ printf("\nScores for complete sequences (score includes all domains):\n"); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Sequence", descwidth, "Description", "Score", "E-value", " N "); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---"); for (i = 0, nreported = 0; i < ghit->num; i++) { char *safedesc; GetRankedHit(ghit, i, &pvalue, &sc, NULL, NULL, &name, NULL, &desc, NULL, NULL, NULL, /* sequence positions */ NULL, NULL, NULL, /* HMM positions */ NULL, &ndom, /* domain info */ NULL); /* alignment info */ evalue = pvalue * (double) thresh.Z; /* safedesc is a workaround for an apparent Linux printf() * bug with the *.*s format. dbmalloc crashes with a memchr() ptr out of bounds * flaw if the malloc'ed space for desc is short. The workaround * is to make sure the ptr for *.* has a big malloc space. */ if (desc != NULL && strlen(desc) < 80) { safedesc = MallocOrDie(sizeof(char) * 80); strcpy(safedesc, desc); } else safedesc = Strdup(desc); if (evalue <= thresh.globE && sc >= thresh.globT) { printf("%-*s %-*.*s %7.1f %10.2g %3d\n", namewidth, name, descwidth, descwidth, safedesc != NULL ? safedesc : "", sc, evalue, ndom); nreported++; } free(safedesc); } if (nreported == 0) printf("\t[no hits above thresholds]\n"); /* 2. Report domain hits (also sorted on E-value) */ FullSortTophits(dhit); namewidth = MAX(8, TophitsMaxName(dhit)); printf("\nParsed for domains:\n"); printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", namewidth, "Sequence", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value"); printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------"); for (i = 0, nreported = 0; i < dhit->num; i++) { GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc, &name, NULL, NULL, &sqfrom, &sqto, &sqlen, /* seq position info */ &hmmfrom, &hmmto, NULL, /* HMM position info */ &domidx, &ndom, /* domain info */ NULL); /* alignment info */ evalue = pvalue * (double) thresh.Z; if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT) continue; else if (evalue <= thresh.domE && sc >= thresh.domT) { printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n", namewidth, name, domidx, ndom, sqfrom, sqto, sqfrom == 1 ? '[' : '.', sqto == sqlen ? ']' : '.', hmmfrom, hmmto, hmmfrom == 1 ? '[':'.', hmmto == hmm->M ? ']' : '.', sc, evalue); nreported++; } } if (nreported == 0) printf("\t[no hits above thresholds]\n"); /* 3. Alignment output, also by domain. * dhits is already sorted and namewidth is set, from above code. * Number of displayed alignments is limited by Alimit parameter; * also by domE (evalue threshold), domT (score theshold). */ if (Alimit != 0) { printf("\nAlignments of top-scoring domains:\n"); for (i = 0, nreported = 0; i < dhit->num; i++) { if (nreported == Alimit) break; /* limit to Alimit output alignments */ GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc, &name, NULL, NULL, &sqfrom, &sqto, &sqlen, /* seq position info */ &hmmfrom, &hmmto, NULL, /* HMM position info */ &domidx, &ndom, /* domain info */ &ali); /* alignment info */ evalue = pvalue * (double) thresh.Z; if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT) continue; else if (evalue <= thresh.domE && sc >= thresh.domT) { printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n", name, domidx, ndom, sqfrom, sqto, sc, evalue); PrintFancyAli(stdout, ali); nreported++; } } if (nreported == 0) printf("\t[no hits above thresholds]\n"); if (nreported == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit); } /* 4. Histogram output */ printf("\nHistogram of all scores:\n"); PrintASCIIHistogram(stdout, histogram); /* 5. Tophits summaries, while developing... */ printf("\nTotal sequences searched: %d\n", nseq); printf("\nWhole sequence top hits:\n"); TophitsReport(ghit, thresh.globE, nseq); printf("\nDomain top hits:\n"); TophitsReport(dhit, thresh.domE, nseq); /*********************************************** * Clean-up and exit. ***********************************************/ FreeHistogram(histogram); HMMFileClose(hmmfp); SeqfileClose(sqfp); FreeTophits(ghit); FreeTophits(dhit); FreePlan7(hmm); SqdClean(); return 0;}/* Function: main_loop_serial() * Date: SRE, Wed Sep 23 10:20:49 1998 [St. Louis] * * Purpose: Search an HMM against a sequence database. * main loop for the serial (non-PVM, non-threads) * version. * * In: HMM and open sqfile, plus options * Out: histogram, global hits list, domain hits list, nseq. * * Args: hmm - the HMM to search with. * sqfp - open SQFILE for sequence database * thresh - score/evalue threshold info * do_forward - TRUE to score using Forward() * do_null2 - TRUE to use ad hoc null2 score correction * do_xnu - TRUE to apply XNU mask * num_threads- number of worker threads to start, or 0 * histogram - RETURN: score histogram * ghit - RETURN: ranked global scores * dhit - RETURN: ranked domain scores * ret_nseq - RETURN: actual number of seqs searched * * Returns: (void) */static voidmain_loop_serial(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward, int do_null2, int do_xnu, int num_threads, struct histogram_s *histogram, struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq){#ifdef HMMER_THREADS struct workpool_s *wpool; /* pool of worker threads */#else struct p7trace_s *tr; /* traceback */ char *seq; /* target sequence */ char *dsq; /* digitized target sequence */ SQINFO sqinfo; /* optional info for seq */ float sc; /* score of an HMM search */ double pvalue; /* pvalue of an HMM score */ double evalue; /* evalue of an HMM score */#endif int nseq; /* number of sequences searched */ #ifdef HMMER_THREADS wpool = workpool_start(hmm, sqfp, do_xnu, do_forward, do_null2, thresh, ghit, dhit, histogram, num_threads); workpool_stop(wpool); nseq = wpool->nseq; workpool_free(wpool);#else /* unthreaded code: */ nseq = 0; while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) { /* Silently skip length 0 seqs. * What, you think this doesn't occur? Welcome to genomics, * young grasshopper. */ if (sqinfo.len == 0) continue; nseq++; dsq = DigitizeSequence(seq, sqinfo.len); if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo.len); /* 1. Recover a trace by Viterbi. */ if (P7ViterbiSize(sqinfo.len, hmm->M) <= RAMLIMIT) sc = P7Viterbi(dsq, sqinfo.len, hmm, &tr); else sc = P7SmallViterbi(dsq, sqinfo.len, hmm, &tr); /* 2. If we're using Forward scores, calculate the * whole sequence score; this overrides anything * PostprocessSignificantHit() is going to do to the per-seq score. */ if (do_forward) { sc = P7Forward(dsq, sqinfo.len, hmm, NULL); if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq); }#if DEBUGLEVEL >= 2 P7PrintTrace(stdout, tr, hmm, dsq); #endif /* 2. Store score/pvalue for global alignment; will sort on score, * which in hmmsearch is monotonic with E-value. * Keep all domains in a significant sequence hit. * We can only make a lower bound estimate of E-value since * we don't know the final value of nseq yet, so the list * of hits we keep in memory is >= the list we actually * output. */ pvalue = PValue(hmm, sc); evalue = thresh->Z ? (double) thresh->Z * pvalue : (double) nseq * pvalue; if (sc >= thresh->globT && evalue <= thresh->globE) { PostprocessSignificantHit(ghit, dhit, tr, hmm, dsq, sqinfo.len, sqinfo.name, sqinfo.flags & SQINFO_ACC ? sqinfo.acc : NULL, sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL, do_forward, sc, do_null2, thresh, FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */ } AddToHistogram(histogram, sc); FreeSequence(seq, &sqinfo); P7FreeTrace(tr); free(dsq); }#endif *ret_nseq = nseq; return;}#ifdef HMMER_PVM/***************************************************************** * PVM specific functions ****************************************************************/ /* Function: main_loop_pvm() * Date: SRE, Wed Sep 23 10:36:44 1998 [St. Louis] * * Purpose: Search an HMM against a sequence database. * main loop for the PVM version. * * In: HMM and open sqfile, plus options * Out: histogram, global hits list, domain hits list, nseq. * * Args: hmm - the HMM to search with. scoring form. * sqfp - open SQFILE for sequence database * thresh - score/evalue threshold information * do_forward - TRUE to score using Forward() * do_null2 - TRUE to use ad hoc null2 score correction * do_xnu - TRUE to apply XNU mask * histogram - RETURN: score histogram * ghit - RETURN: ranked global scores * dhit - RETURN: ranked domain scores * ret_nseq - RETURN: actual number of seqs searched * * Returns: (void) */static voidmain_loop_pvm(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward, int do_null2, int do_xnu, struct histogram_s *histogram, struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq){ char *seq; /* target sequence */ char *dsq; /* digitized target seq */ SQINFO sqinfo; /* optional info about target seq */ int master_tid; /* master's (my) PVM TID */ int *slave_tid; /* array of slave TID's */ int nslaves; /* number of slaves */ int code; /* status code rec'd from a slave */ int nseq; /* number of sequences searched */ int sent_trace; /* TRUE if slave gave us a trace */ char **dsqlist; /* remember what seqs slaves are doing */ char **namelist; /* remember what seq names slaves are doing */ char **acclist ; /* remember what seq accessions slaves are doing */ char **desclist; /* remember what seq desc's slaves are doing */ int *lenlist; /* remember lengths of seqs slaves are doing */ int slaveidx; /* counter for slaves */ float sc; /* score of an alignment */ double pvalue; /* P-value of a score of an alignment */ struct p7trace_s *tr; /* Viterbi traceback of an alignment */ int i; /* generic counter */ /* Initialize PVM. */ SQD_DPRINTF1(("Requesting master TID...\n")); master_tid = pvm_mytid();#if DEBUGLEVEL >= 1 pvm_catchout(stderr); /* catch output for debugging */#endif SQD_DPRINTF1(("Spawning slaves...\n")); PVMSpawnSlaves("hmmsearch-pvm", &slave_tid, &nslaves); SQD_DPRINTF1(("Spawned a total of %d slaves...\n", nslaves)); /* Initialize the slaves by broadcast. */ SQD_DPRINTF1(("Broadcasting to %d slaves...\n", nslaves)); pvm_initsend(PvmDataDefault); pvm_pkfloat(&(thresh->globT), 1, 1); pvm_pkdouble(&(thresh->globE), 1, 1); pvm_pkint(&(thresh->Z), 1, 1); pvm_pkint(&do_forward, 1, 1); pvm_pkint(&do_null2, 1, 1); pvm_pkint(&Alphabet_type, 1, 1); PVMPackHMM(hmm); pvm_mcast(slave_tid, nslaves, HMMPVM_INIT); SQD_DPRINTF1(("Slaves should be ready...\n")); /* Confirm slaves' OK status. */ PVMConfirmSlaves(slave_tid, nslaves); SQD_DPRINTF1(("Slaves confirm that they're ok...\n")); /* Alloc arrays for remembering what seq each * slave was working on. */ namelist = MallocOrDie(sizeof(char *) * nslaves); acclist = MallocOrDie(sizeof(char *) * nslaves); desclist = MallocOrDie(sizeof(char *) * nslaves); dsqlist = MallocOrDie(sizeof(char *) * nslaves); lenlist = MallocOrDie(sizeof(int) * nslaves); /* Load the slaves. * Give them all a sequence number and a digitized sequence * to work on. * A side effect of the seq number is that we assign each slave
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -