hmmpfam.c
来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,210 行 · 第 1/4 页
C
1,210 行
/*********************************************** * Show the banner ***********************************************/ HMMERBanner(stdout, banner); printf( "HMM file: %s\n", hmmfile); printf( "Sequence file: %s\n", seqfile); if (do_pvm) printf( "PVM: ACTIVE\n"); printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n"); /*********************************************** * Search each HMM against each sequence ***********************************************/ while (ReadSeq(sqfp, format, &seq, &sqinfo)) { ghit = AllocTophits(20); /* keeps full seq scores */ dhit = AllocTophits(20); /* keeps domain scores */ /* 1. Search sequence against all HMMs. * Significant scores+alignments accumulate in ghit, dhit. */ if (pvm_support && do_pvm) main_loop_pvm(hmmfile, hmmfp, seq, &sqinfo, &thresh, do_xnu, do_forward, do_null2, ghit, dhit, &nhmm); else if (threads_support && num_threads) main_loop_threaded(hmmfile, hmmfp, seq, &sqinfo, &thresh, do_xnu, do_forward, do_null2, num_threads, ghit, dhit, &nhmm); else main_loop_serial(hmmfile, hmmfp, seq, &sqinfo, &thresh, do_xnu, do_forward, do_null2, ghit, dhit, &nhmm); /* set Z for good now that we're done */ if (!thresh.Z) thresh.Z = nhmm; /* 2. (Done searching all HMMs for this query seq; start output) * Report the overall sequence hits, sorted by significance. */ if (be_backwards) { printf("Query: %s %s\n", sqinfo.name, sqinfo.flags & SQINFO_DESC ? sqinfo.desc : ""); } else { printf("\nQuery sequence: %s\n", sqinfo.name); printf("Accession: %s\n", sqinfo.flags &SQINFO_ACC ? sqinfo.acc : "[none]"); printf("Description: %s\n", sqinfo.flags &SQINFO_DESC? sqinfo.desc : "[none]"); } /* We'll now sort the global hit list by evalue... * (not score! that was bug #12. in hmmpfam, score and evalue are not * monotonic.) */ FullSortTophits(ghit); namewidth = MAX(8, TophitsMaxName(ghit)); /* must print whole name, no truncation */ descwidth = MAX(52-namewidth, 11); /* may truncate desc, but avoid neg len! */ printf("\nScores for sequence family classification (score includes all domains):\n"); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Model", descwidth, "Description", "Score", "E-value", " N "); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---"); for (i = 0, nreported = 0; i < ghit->num; i++) { char *safedesc; GetRankedHit(ghit, i, &pvalue, &sc, NULL, NULL, &name, &acc, &desc, NULL, NULL, NULL, /* seq positions */ NULL, NULL, NULL, /* HMM positions */ NULL, &ndom, /* domain info */ NULL); /* alignment info*/ evalue = pvalue * (double) thresh.Z; /* safedesc is a workaround for an apparent Linux printf() * bug with the *.*s format. dbmalloc crashes with a memchr() ptr out of bounds * flaw if the malloc'ed space for desc is short. The workaround * is to make sure the ptr for *.* has a big malloc space. */ if (desc != NULL && strlen(desc) < 80) { safedesc = MallocOrDie(sizeof(char) * 80); strcpy(safedesc, desc); } else safedesc = Strdup(desc); /* sneaky trick warning: * if we're using dynamic Pfam score cutoffs (GA, TC, NC), * then the list of hits is already correct and does not * need any score cutoffs. Unset the thresholds. They'll * be reset in the main_loop if we still have sequences * to process. */ if (thresh.autocut != CUT_NONE) { thresh.globE = thresh.domE = FLT_MAX; thresh.globT = thresh.domT = -FLT_MAX; } if (evalue <= thresh.globE && sc >= thresh.globT) { printf("%-*s %-*.*s %7.1f %10.2g %3d\n", namewidth, (show_acc && acc != NULL) ? acc : name, descwidth, descwidth, safedesc != NULL ? safedesc : "", sc, evalue, ndom); nreported++; } free(safedesc); } if (nreported == 0) printf("\t[no hits above thresholds]\n"); /* 3. Report domain hits (sorted on sqto coordinate) */ FullSortTophits(dhit); namewidth = MAX(8, TophitsMaxName(dhit)); /* must print whole name, no truncation */ printf("\nParsed for domains:\n"); printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", namewidth, "Model", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value"); printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------"); for (i = 0, nreported = 0; i < dhit->num; i++) { GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc, &name, &acc, NULL, &sqfrom, &sqto, NULL, &hmmfrom, &hmmto, &hmmlen, &domidx, &ndom, NULL); evalue = pvalue * (double) thresh.Z; /* Does the "mother" (complete) sequence satisfy global thresholds? */ if (motherp * (double)thresh. Z > thresh.globE || mothersc < thresh.globT) continue; else if (evalue <= thresh.domE && sc >= thresh.domT) { printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n", namewidth, (show_acc && acc != NULL) ? acc : name, domidx, ndom, sqfrom, sqto, sqfrom == 1 ? '[' : '.', sqto == sqinfo.len ? ']' : '.', hmmfrom, hmmto, hmmfrom == 1 ? '[':'.', hmmto == hmmlen ? ']' : '.', sc, evalue); nreported++; } } if (nreported == 0) printf("\t[no hits above thresholds]\n"); /* 3. Alignment output, also by domain. * dhits is already sorted and namewidth is set, from above code. * Number of displayed alignments is limited by Alimit parameter; * also by domE (evalue threshold), domT (score theshold). */ if (Alimit != 0) { printf("\nAlignments of top-scoring domains:\n"); for (i = 0, nreported = 0; i < dhit->num; i++) { if (nreported == Alimit) break; /* limit to Alimit output alignments */ GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc, &name, &acc, NULL, &sqfrom, &sqto, NULL, /* seq position info */ &hmmfrom, &hmmto, &hmmlen, /* HMM position info */ &domidx, &ndom, /* domain info */ &ali); /* alignment info */ evalue = pvalue * (double) thresh.Z; if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT) continue; else if (evalue <= thresh.domE && sc >= thresh.domT) { printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n", (show_acc && acc != NULL) ? acc : name, domidx, ndom, sqfrom, sqto, sc, evalue); PrintFancyAli(stdout, ali); nreported++; } } if (nreported == 0) printf("\t[no hits above thresholds]\n"); if (nreported == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit); } printf("//\n"); FreeSequence(seq, &sqinfo); FreeTophits(ghit); FreeTophits(dhit); HMMFileRewind(hmmfp); } /*********************************************** * Clean-up and exit. ***********************************************/ SeqfileClose(sqfp); HMMFileClose(hmmfp); SqdClean(); return 0;}/* Function: main_loop_serial() * Date: SRE, Fri Aug 7 13:46:48 1998 [St. Louis] * * Purpose: Search a sequence against an HMM database; * main loop for the serial (non-PVM, non-threads) * version. * * On return, ghit and dhit contain info for all hits * that satisfy the set thresholds. If an evalue * cutoff is used at all, the lists will be overestimated -- * because the evalue will be underestimated until * we know the final Z. (Thus the main program must recheck * thresholds before printing any results.) If only * score cutoffs are used, then the lists are correct, * and may be printed exactly as they come (after * appropriate sorting, anyway). This is especially * important for dynamic thresholding using Pfam * score cutoffs -- the main caller cannot afford to * rescan the HMM file just to get the GA/TC/NC cutoffs * back out for each HMM, and neither do I want to * burn the space to store them as I make a pass thru * Pfam. * * Args: hmmfile - name of HMM file * hmmfp - open HMM file (and at start of file) * dsq - digitized sequence * sqinfo - ptr to SQINFO optional info for dsq * thresh - score/evalue threshold information * do_xnu - TRUE to apply XNU filter to sequence * do_forward - TRUE to use Forward() scores * do_null2 - TRUE to adjust scores w/ ad hoc null2 model * num_threads- number of threads, if threaded * ghit - global hits list * dhit - domain hits list * ret_nhmm - number of HMMs searched. * * Returns: (void) */static voidmain_loop_serial(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo, struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2, struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm) { unsigned char *dsq; /* digitized sequence */ int nhmm; /* number of HMMs searched */ struct plan7_s *hmm; /* current HMM to search with */ struct p7trace_s *tr; /* traceback of alignment */ struct dpmatrix_s *mx; /* growable DP matrix */ float sc; /* an alignment score */ double pvalue; /* pvalue of an HMM score */ double evalue; /* evalue of an HMM score */ /* Prepare sequence. */ SQD_DPRINTF1(("main_loop_serial:\n")); dsq = DigitizeSequence(seq, sqinfo->len); if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo->len); /* * We'll create for at least N=300xM=300, and thus consume at least 1 MB, * regardless of RAMLIMIT -- this helps us avoid reallocating some weird * asymmetric matrices. * * We're growable in both M and N, because inside of P7SmallViterbi, * we're going to be calling P7Viterbi on subpieces that vary in size, * and for different models. */ mx = CreatePlan7Matrix(300, 300, 25, 25); nhmm = 0; while (HMMFileRead(hmmfp, &hmm)) { if (hmm == NULL) Die("HMM file %s may be corrupt or in incorrect format; parse failed", hmmfile); P7Logoddsify(hmm, !(do_forward)); SQD_DPRINTF1((" ... working on HMM %s\n", hmm->name)); SQD_DPRINTF1((" ... mx is now M=%d by N=%d\n", mx->maxM, mx->maxN)); if (! SetAutocuts(thresh, hmm)) Die("HMM %s did not contain the GA, TC, or NC cutoffs you needed", hmm->name); /* Score sequence, do alignment (Viterbi), recover trace */ if (P7ViterbiSpaceOK(sqinfo->len, hmm->M, mx)) { SQD_DPRINTF1((" ... using normal P7Viterbi(); size ~%d MB\n", P7ViterbiSize(sqinfo->len, hmm->M))); sc = P7Viterbi(dsq, sqinfo->len, hmm, mx, &tr); } else {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?