⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hmmbuild.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 3 页
字号:
      /* Make alignment upper case, because some symbol counting       * things are case-sensitive.       */      for (idx = 0; idx < msa->nseq; idx++)	s2upper(msa->aseq[idx]);      /* Set up the alphabet globals:       * either already set by --amino or --nucleic, or       * we guess based on the first alignment we see       */      if (Alphabet_type == hmmNOTSETYET) 	DetermineAlphabet(msa->aseq, msa->nseq);      /* Do some initialization the first time through.       * This code must be delayed until after we've seen the       * first alignment, because we have to see the alphabet type first       */      if (nali == 0) 	{				/* Set up Dirichlet priors */	  if (prifile == NULL)  pri = P7DefaultPrior();	  else                  pri = P7ReadPrior(prifile);	  if (pamfile != NULL)  PAMPrior(pamfile, pri, pamwgt);				/* Set up the null/random seq model */	  if (rndfile == NULL)  P7DefaultNullModel(randomseq, &p1);	  else                  P7ReadNullModel(rndfile, randomseq, &p1);	}      /* Prepare unaligned digitized sequences for internal use        */      DigitizeAlignment(msa, &dsq);        /* In some respects we treat DNA more crudely for now;       * for example, we can't do eff seq #, because it's       * calibrated for protein.       */      if (Alphabet_type == hmmNUCLEIC) 	do_eff = FALSE;	      /* Determine "effective sequence number".       * The BlosumWeights() routine is now an efficient O(N)       * memory clustering algorithm that doesn't blow up on,       * say, Pfam's GP120 alignment (13000+ sequences)       */      eff_nseq = (float) msa->nseq;      if (do_eff)	{	  float *wgt;	  printf("%-40s ... ", "Determining effective sequence number");	  fflush(stdout);				/* dummy weights array to feed BlosumWeights*/	  wgt = MallocOrDie(sizeof(float) * msa->nseq);	  BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt);	  eff_nseq = FSum(wgt, msa->nseq);	  free(wgt);	  printf("done. [%.0f]\n", eff_nseq);	}      /* Weight the sequences (optional),       */      if (w_strategy == WGT_GSC     || 	  w_strategy == WGT_BLOSUM  || 	  w_strategy == WGT_VORONOI ||	  w_strategy == WGT_PB)	{	  printf("%-40s ... ", "Weighting sequences heuristically");	  fflush(stdout);	  if (w_strategy != WGT_PB && msa->nseq >= pbswitch)	    {	      printf("[big alignment! doing PB]... ");	      PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);	    }	  else if (w_strategy == WGT_GSC) 	    GSCWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);	  else if (w_strategy == WGT_BLOSUM)	    BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, msa->wgt);	  else if (w_strategy == WGT_PB)	    PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);	  else if (w_strategy ==  WGT_VORONOI)	    VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); 	  printf("done.\n");	}      /* Set the effective sequence number (if do_eff is FALSE, eff_nseq        * was set to nseq).       */      FNorm(msa->wgt,  msa->nseq);      FScale(msa->wgt, msa->nseq, eff_nseq);      /* Build a model architecture.       * If we're not doing MD or ME, that's all we need to do.       * We get an allocated, counts-based HMM back.       *        * Because the architecture algorithms are allowed to change       * gap characters in the alignment, we have to calculate the       * alignment checksum before we enter the algorithms.       */      printf("%-40s ... ", "Constructing model architecture");      fflush(stdout);      checksum = GCGMultchecksum(msa->aseq, msa->nseq);      if (c_strategy == P7_FAST_CONSTRUCTION)	P7Fastmodelmaker(msa, dsq, gapmax, &hmm, &tr);      else if (c_strategy == P7_HAND_CONSTRUCTION)	P7Handmodelmaker(msa, dsq, &hmm, &tr);      else	P7Maxmodelmaker(msa, dsq, gapmax, 			pri, randomseq, p1, archpri, &hmm, &tr);      hmm->checksum = checksum;      printf("done.\n");      /* Save the count vectors if asked. Used primarily for       * making the data files for training priors.       */      if (cfile != NULL) 	{	  printf("%-40s ... ", "Saving count vector file");	  fflush(stdout);	  save_countvectors(cfp, 			    (msa->name != NULL ? msa->name : "-"),			    hmm); 	  printf("done. [%s]\n", cfile);	}      /* Record the null model in the HMM;       * add prior contributions in pseudocounts and renormalize.       */      printf("%-40s ... ", "Converting counts to probabilities");      fflush(stdout);      Plan7SetNullModel(hmm, randomseq, p1);      P7PriorifyHMM(hmm, pri);      printf("done.\n");      /* Model configuration, temporary.       * hmmbuild assumes that it's given an alignment of single domains,       * and the alignment may contain fragments. So, for the purpose of       * scoring the sequences (or, optionally, MD/ME weighting),       * configure the model into hmmsw mode. Later we'll       * configure the model according to how the user wants to       * use it.       */      Plan7SWConfig(hmm, 0.5, 0.5);      /* Do model-dependent "weighting" strategies.       */      if (w_strategy == WGT_ME)	{	  printf("\n%-40s ...\n", "Maximum entropy weighting, iterative");	  maximum_entropy(hmm, dsq, msa, eff_nseq, pri, tr);	  printf("----------------------------------------------\n\n");	}      /* Give the model a name.       * We deal with this differently depending on whether       * we're in an alignment database or a single alignment.       *        * If a single alignment, priority is:       *      1. Use -n <name> if set.       *      2. Use msa->name (avail in Stockholm or SELEX formats only)       *      3. If all else fails, use alignment file name without       *         filename extension (e.g. "globins.slx" gets named "globins"       *                * If a multiple MSA database (e.g. Stockholm/Pfam),        * only msa->name is applied. -n is not allowed.       * if msa->name is unavailable, or -n was used,       * a fatal error is thrown.       *        * Because we can't tell whether we've got more than one       * alignment 'til we're on the second one, these fatal errors       * only happen after the first HMM has already been built.       * Oh well.       */      printf("%-40s ... ", "Setting model name, etc.");      fflush(stdout);      if (nali == 0)		/* first (only?) HMM in file:  */	{	  if      (setname != NULL)   name = Strdup(setname);	  else if (msa->name != NULL) name = Strdup(msa->name);	  else                        name = FileTail(seqfile, TRUE);	}      else	{	  if (setname != NULL) 	    Die("Oops. Wait. You can't use -n with an alignment database.");	  else if (msa->name != NULL) name = Strdup(msa->name);	  else	    Die("Oops. Wait. I need name annotation on each alignment.\n");	}      Plan7SetName(hmm, name);      free(name);      /* Transfer other information from the alignment to       * the HMM. This typically only works for SELEX format       * alignments, so these things are conditional/optional.       */      if (msa->acc  != NULL) Plan7SetAccession(hmm,   msa->acc);      if (msa->desc != NULL) Plan7SetDescription(hmm, msa->desc);      if (msa->flags & MSA_SET_GA) 	{ hmm->flags |= PLAN7_GA; hmm->ga1 = msa->ga1; hmm->ga2 = msa->ga2; }      if (msa->flags & MSA_SET_TC) 	{ hmm->flags |= PLAN7_TC; hmm->tc1 = msa->tc1; hmm->tc2 = msa->tc2; }      if (msa->flags & MSA_SET_NC) 	{ hmm->flags |= PLAN7_NC; hmm->nc1 = msa->nc1; hmm->nc2 = msa->nc2; }      /* Record some other miscellaneous information in the HMM,       * like how/when we built it.       */      Plan7ComlogAppend(hmm, argc, argv);      Plan7SetCtime(hmm);      hmm->nseq = msa->nseq;      printf("done. [%s]\n", hmm->name);          /* Print information for the user       */      printf("\nConstructed a profile HMM (length %d)\n", hmm->M);      PrintPlan7Stats(stdout, hmm, dsq, msa->nseq, tr);       printf("\n");      /* Configure the model for chosen algorithm       */      printf("%-40s ... ", "Finalizing model configuration");      fflush(stdout);      switch (cfg_strategy) {      case P7_BASE_CONFIG:  Plan7GlobalConfig(hmm);              break;      case P7_SW_CONFIG:    Plan7SWConfig(hmm, swentry, swexit); break;      case P7_LS_CONFIG:    Plan7LSConfig(hmm);                  break;      case P7_FS_CONFIG:    Plan7FSConfig(hmm, swentry, swexit); break;      default:              Die("bogus configuration choice");      }      printf("done.\n");      /* Save new HMM to disk: open a file for appending or writing.       */      printf("%-40s ... ", "Saving model to file");      fflush(stdout);      if (do_binary) WriteBinHMM(hmmfp, hmm);      else           WriteAscHMM(hmmfp, hmm);      printf("done.\n");				/* the annotated alignment may be resaved */      if (alignfp != NULL) 	{	  MSA    *new_msa;	  SQINFO *sqinfo; 	  printf("%-40s ... ", "Saving annotated alignment");	  fflush(stdout);	  sqinfo  = MSAToSqinfo(msa);	  new_msa = P7Traces2Alignment(dsq, sqinfo, msa->wgt, msa->nseq, 				       hmm->M, tr, FALSE);	  WriteStockholm(alignfp, new_msa);	  MSAFree(new_msa);	  for (idx = 0; idx < msa->nseq; idx++)	    FreeSequence(NULL, &(sqinfo[idx]));	  free(sqinfo);	  printf("done.\n");	}      /* Verbose output; show scores for each sequence       */      if (verbose)	print_all_scores(stdout, hmm, dsq, msa, tr);      /* Clean up before moving on to next alignment       */      for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]);      free(tr);      FreePlan7(hmm);      MSAFree(msa);      Free2DArray((void **) dsq, msa->nseq);       fflush(hmmfp);      if (cfp != NULL)     fflush(cfp);      if (alignfp != NULL) fflush(alignfp);      puts("//\n");      nali++;    }  /* Clean up and exit   */  MSAFileClose(afp);  fclose(hmmfp);  if (cfp != NULL)     fclose(cfp);  if (alignfp != NULL) fclose(alignfp);  P7FreePrior(pri);  SqdClean();  return 0;}/* Function: print_all_scores() *  * Purpose:  For each training sequence, print its score under *           the final model. *            * Args:     fp   - where to print the output (usu. stdout) *           hmm  - newly constructed HMM, with prob's. *           dsq  - digitized unaligned training sequences. *           msa  - alignment and associated info  *           tr   - array of tracebacks *            * Return:   (void)                          */static voidprint_all_scores(FILE *fp, struct plan7_s *hmm,		 char **dsq, MSA *msa, struct p7trace_s **tr){  int idx;			/* counter for sequences */				/* make sure model scores are ready */  P7Logoddsify(hmm, TRUE);				/* header */  fputs("**\n", fp);  fputs("Individual training sequence scores:\n", fp);				/* score for each sequence */  for (idx = 0; idx < msa->nseq; idx++)     {      fprintf(fp, "%7.2f %-12s %s\n", 	      P7TraceScore(hmm, dsq[idx], tr[idx]),	      msa->sqname[idx],	      (MSAGetSeqDescription(msa,idx) != NULL) ? 	       MSAGetSeqDescription(msa,idx) : "");      P7PrintTrace(fp, tr[idx], hmm, dsq[idx]);    }  fputs("\n", fp);}/* Function: save_countvectors() *  * Purpose:  Save emission/transition count vectors to a file. *           Used for gathering the data on which to train a *           prior (e.g. mixture Dirichlet, etc.) *            *           The format of the file is one vector per line: *           M <f> <f> ...: 20 match emission counts in order AC..WY. *                          followed by two chars of CS, CA annotation. *           I <f> <f> ...: 20 insert emission counts in order AC..WY. *                          followed by two chars of CS, CA annotation. *           T <f> <f> ...: 7 transition counts in order TMM, TMI, TMD,  *                            TIM, TII, TDM, TDD. (see structs.h) *                            followed by four chars of structure *                            annotation: CS, CS of M+1; CA, CA of M+1.  *            * Args:     cfp    - open counts file  *           name   - name of alignment or HMM to associate with these vectors *           hmm    - counts-based HMM */static voidsave_countvectors(FILE *cfp, char *name, struct plan7_s *hmm){  int k, x;				/* match emission vectors */  for (k = 1; k <= hmm->M; k++)    {      fputs("M ", cfp);      for (x = 0; x < Alphabet_size; x++)	fprintf(cfp, "%8.2f ", hmm->mat[k][x]);      fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);      if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA)	fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]);      else	fputs("- -", cfp);      fputs("\n", cfp);    }				/* insert emission vectors */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -