hmmpfam.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,210 行 · 第 1/4 页

C
1,210
字号
  /***********************************************    * Show the banner   ***********************************************/  HMMERBanner(stdout, banner);  printf(   "HMM file:                 %s\n", hmmfile);  printf(   "Sequence file:            %s\n", seqfile);  if (do_pvm)    printf( "PVM:                      ACTIVE\n");  printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n");  /***********************************************    * Search each HMM against each sequence   ***********************************************/  while (ReadSeq(sqfp, format, &seq, &sqinfo))     {      ghit = AllocTophits(20);   /* keeps full seq scores */      dhit = AllocTophits(20);   /* keeps domain scores   */      /* 1. Search sequence against all HMMs.       *    Significant scores+alignments accumulate in ghit, dhit.       */      if (pvm_support && do_pvm)	main_loop_pvm(hmmfile, hmmfp, seq, &sqinfo, 		      &thresh, do_xnu, do_forward, do_null2,		      ghit, dhit, &nhmm);      else if (threads_support && num_threads)	main_loop_threaded(hmmfile, hmmfp, seq, &sqinfo, 			   &thresh, do_xnu, do_forward, do_null2, num_threads,			   ghit, dhit, &nhmm);      else 	main_loop_serial(hmmfile, hmmfp, seq, &sqinfo, 			 &thresh, do_xnu, do_forward, do_null2, 			 ghit, dhit, &nhmm);				/* set Z for good now that we're done */      if (!thresh.Z) thresh.Z = nhmm;	      /* 2. (Done searching all HMMs for this query seq; start output)       *    Report the overall sequence hits, sorted by significance.       */      if (be_backwards)	{	  printf("Query:  %s  %s\n", sqinfo.name, 		 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : "");	}      else	{	  printf("\nQuery sequence: %s\n", sqinfo.name);	  printf("Accession:      %s\n", sqinfo.flags &SQINFO_ACC ? sqinfo.acc  : "[none]");	  printf("Description:    %s\n", sqinfo.flags &SQINFO_DESC? sqinfo.desc : "[none]");	}      /* We'll now sort the global hit list by evalue...        * (not score! that was bug #12. in hmmpfam, score and evalue are not        *  monotonic.)       */      FullSortTophits(ghit);	      namewidth = MAX(8, TophitsMaxName(ghit)); /* must print whole name, no truncation  */      descwidth = MAX(52-namewidth, 11);        /* may truncate desc, but avoid neg len! */      printf("\nScores for sequence family classification (score includes all domains):\n");      printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Model", descwidth, "Description", "Score", "E-value", " N ");      printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---");      for (i = 0, nreported = 0; i < ghit->num; i++)	{	  char *safedesc;	  GetRankedHit(ghit, i, 		       &pvalue, &sc, NULL, NULL,		       &name, &acc, &desc,		       NULL, NULL, NULL,           /* seq positions */		       NULL, NULL, NULL,           /* HMM positions */		       NULL, &ndom,                /* domain info   */		       NULL);	                   /* alignment info*/	  evalue = pvalue * (double) thresh.Z;	  /* safedesc is a workaround for an apparent Linux printf()	   * bug with the *.*s format. dbmalloc crashes with a memchr() ptr out of bounds	   * flaw if the malloc'ed space for desc is short. The workaround           * is to make sure the ptr for *.* has a big malloc space.	   */	  if (desc != NULL && strlen(desc) < 80) 	    {	      safedesc = MallocOrDie(sizeof(char) * 80);	      strcpy(safedesc, desc);	    }	  else safedesc = Strdup(desc);	  /* sneaky trick warning:	   * if we're using dynamic Pfam score cutoffs (GA, TC, NC),	   * then the list of hits is already correct and does not	   * need any score cutoffs. Unset the thresholds. They'll           * be reset in the main_loop if we still have sequences           * to process.	   */	  if (thresh.autocut != CUT_NONE) {	    thresh.globE = thresh.domE = FLT_MAX;	    thresh.globT = thresh.domT = -FLT_MAX;	  }	  if (evalue <= thresh.globE && sc >= thresh.globT) 	    {	      printf("%-*s %-*.*s %7.1f %10.2g %3d\n", 		     namewidth, 		     (show_acc && acc != NULL) ?  acc : name,		     descwidth, descwidth, safedesc != NULL ? safedesc : "",		     sc, evalue, ndom);	      nreported++;	    }	  free(safedesc);	}      if (nreported == 0) printf("\t[no hits above thresholds]\n");      /* 3. Report domain hits (sorted on sqto coordinate)       */      FullSortTophits(dhit);      namewidth = MAX(8, TophitsMaxName(dhit)); /* must print whole name, no truncation  */      printf("\nParsed for domains:\n");      printf("%-*s %7s %5s %5s    %5s %5s    %7s %8s\n",	     namewidth, "Model", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value");      printf("%-*s %7s %5s %5s    %5s %5s    %7s %8s\n",	     namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------");            for (i = 0, nreported = 0; i < dhit->num; i++)	{	  GetRankedHit(dhit, i, 		       &pvalue, &sc, &motherp, &mothersc,		       &name, &acc, NULL,		       &sqfrom, &sqto, NULL, 		       &hmmfrom, &hmmto, &hmmlen, 		       &domidx, &ndom,		       NULL);	  evalue = pvalue * (double) thresh.Z;	  				/* Does the "mother" (complete) sequence satisfy global thresholds? */	  if (motherp * (double)thresh. Z > thresh.globE || mothersc < thresh.globT) 	    continue;	  else if (evalue <= thresh.domE && sc >= thresh.domT) {	    printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n",		   namewidth, 		   (show_acc && acc != NULL) ?  acc : name,		   domidx, ndom,		   sqfrom, sqto, 		   sqfrom == 1 ? '[' : '.', sqto == sqinfo.len ? ']' : '.',		   hmmfrom, hmmto,		   hmmfrom == 1 ? '[':'.',  hmmto == hmmlen ? ']' : '.',		   sc, evalue);	    nreported++;	  }	}      if (nreported == 0) printf("\t[no hits above thresholds]\n");            /* 3. Alignment output, also by domain.       *    dhits is already sorted and namewidth is set, from above code.       *    Number of displayed alignments is limited by Alimit parameter;       *    also by domE (evalue threshold), domT (score theshold).       */      if (Alimit != 0)	{	  printf("\nAlignments of top-scoring domains:\n");	  for (i = 0, nreported = 0; i < dhit->num; i++)	    {	      if (nreported == Alimit) break; /* limit to Alimit output alignments */	      GetRankedHit(dhit, i, 			   &pvalue, &sc, &motherp, &mothersc,			   &name, &acc, NULL,			   &sqfrom, &sqto, NULL,              /* seq position info  */			   &hmmfrom, &hmmto, &hmmlen,         /* HMM position info  */			   &domidx, &ndom,                    /* domain info        */			   &ali);	                      /* alignment info     */	      evalue = pvalue * (double) thresh.Z;	      if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT) 		continue;	      else if (evalue <= thresh.domE && sc >= thresh.domT) 		{		  printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n", 			 (show_acc && acc != NULL) ?  acc : name,			 domidx, ndom, sqfrom, sqto, sc, evalue);		  PrintFancyAli(stdout, ali);		  nreported++;		}	    }	  if (nreported == 0)      printf("\t[no hits above thresholds]\n");	  if (nreported == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit);	}      printf("//\n");      FreeSequence(seq, &sqinfo);       FreeTophits(ghit);      FreeTophits(dhit);      HMMFileRewind(hmmfp);    }  /***********************************************    * Clean-up and exit.   ***********************************************/  SeqfileClose(sqfp);  HMMFileClose(hmmfp);  SqdClean();  return 0;}/* Function: main_loop_serial() * Date:     SRE, Fri Aug  7 13:46:48 1998 [St. Louis] * * Purpose:  Search a sequence against an HMM database; *           main loop for the serial (non-PVM, non-threads) *           version.  *            *           On return, ghit and dhit contain info for all hits *           that satisfy the set thresholds. If an evalue *           cutoff is used at all, the lists will be overestimated -- *           because the evalue will be underestimated until *           we know the final Z. (Thus the main program must recheck *           thresholds before printing any results.) If only *           score cutoffs are used, then the lists are correct, *           and may be printed exactly as they come (after  *           appropriate sorting, anyway). This is especially *           important for dynamic thresholding using Pfam *           score cutoffs -- the main caller cannot afford to *           rescan the HMM file just to get the GA/TC/NC cutoffs *           back out for each HMM, and neither do I want to *           burn the space to store them as I make a pass thru *           Pfam. * * Args:     hmmfile - name of HMM file *           hmmfp   - open HMM file (and at start of file) *           dsq     - digitized sequence *           sqinfo  - ptr to SQINFO optional info for dsq *           thresh  - score/evalue threshold information *           do_xnu     - TRUE to apply XNU filter to sequence *           do_forward - TRUE to use Forward() scores *           do_null2   - TRUE to adjust scores w/ ad hoc null2 model *           num_threads- number of threads, if threaded *           ghit    - global hits list *           dhit    - domain hits list *           ret_nhmm    - number of HMMs searched. * * Returns:  (void) */static voidmain_loop_serial(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo, 		 struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,		 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm) {  unsigned char     *dsq;       /* digitized sequence                      */  int                nhmm;	/* number of HMMs searched                 */  struct plan7_s    *hmm;       /* current HMM to search with              */   struct p7trace_s  *tr;	/* traceback of alignment                  */  struct dpmatrix_s *mx;        /* growable DP matrix                      */  float   sc;                   /* an alignment score                      */   double  pvalue;		/* pvalue of an HMM score                  */  double  evalue;		/* evalue of an HMM score                  */  /* Prepare sequence.   */  SQD_DPRINTF1(("main_loop_serial:\n"));  dsq = DigitizeSequence(seq, sqinfo->len);  if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo->len);  /*    * We'll create for at least N=300xM=300, and thus consume at least 1 MB,   * regardless of RAMLIMIT -- this helps us avoid reallocating some weird   * asymmetric matrices.   *    * We're growable in both M and N, because inside of P7SmallViterbi,   * we're going to be calling P7Viterbi on subpieces that vary in size,   * and for different models.   */  mx = CreatePlan7Matrix(300, 300, 25, 25);  nhmm = 0;  while (HMMFileRead(hmmfp, &hmm)) {    if (hmm == NULL)       Die("HMM file %s may be corrupt or in incorrect format; parse failed", hmmfile);    P7Logoddsify(hmm, !(do_forward));    SQD_DPRINTF1(("   ... working on HMM %s\n", hmm->name));    SQD_DPRINTF1(("   ... mx is now M=%d by N=%d\n", mx->maxM, mx->maxN));    if (! SetAutocuts(thresh, hmm))       Die("HMM %s did not contain the GA, TC, or NC cutoffs you needed",	  hmm->name);    /* Score sequence, do alignment (Viterbi), recover trace     */    if (P7ViterbiSpaceOK(sqinfo->len, hmm->M, mx))      {	SQD_DPRINTF1(("   ... using normal P7Viterbi(); size ~%d MB\n", 		      P7ViterbiSize(sqinfo->len, hmm->M)));	sc = P7Viterbi(dsq, sqinfo->len, hmm, mx, &tr);      }    else      {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?