📄 hmmbuild.c
字号:
/* Make alignment upper case, because some symbol counting * things are case-sensitive. */ for (idx = 0; idx < msa->nseq; idx++) s2upper(msa->aseq[idx]); /* Set up the alphabet globals: * either already set by --amino or --nucleic, or * we guess based on the first alignment we see */ if (Alphabet_type == hmmNOTSETYET) DetermineAlphabet(msa->aseq, msa->nseq); /* Do some initialization the first time through. * This code must be delayed until after we've seen the * first alignment, because we have to see the alphabet type first */ if (nali == 0) { /* Set up Dirichlet priors */ if (prifile == NULL) pri = P7DefaultPrior(); else pri = P7ReadPrior(prifile); if (pamfile != NULL) PAMPrior(pamfile, pri, pamwgt); /* Set up the null/random seq model */ if (rndfile == NULL) P7DefaultNullModel(randomseq, &p1); else P7ReadNullModel(rndfile, randomseq, &p1); } /* Prepare unaligned digitized sequences for internal use */ DigitizeAlignment(msa, &dsq); /* In some respects we treat DNA more crudely for now; * for example, we can't do eff seq #, because it's * calibrated for protein. */ if (Alphabet_type == hmmNUCLEIC) do_eff = FALSE; /* Determine "effective sequence number". * The BlosumWeights() routine is now an efficient O(N) * memory clustering algorithm that doesn't blow up on, * say, Pfam's GP120 alignment (13000+ sequences) */ eff_nseq = (float) msa->nseq; if (do_eff) { float *wgt; printf("%-40s ... ", "Determining effective sequence number"); fflush(stdout); /* dummy weights array to feed BlosumWeights*/ wgt = MallocOrDie(sizeof(float) * msa->nseq); BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt); eff_nseq = FSum(wgt, msa->nseq); free(wgt); printf("done. [%.0f]\n", eff_nseq); } /* Weight the sequences (optional), */ if (w_strategy == WGT_GSC || w_strategy == WGT_BLOSUM || w_strategy == WGT_VORONOI || w_strategy == WGT_PB) { printf("%-40s ... ", "Weighting sequences heuristically"); fflush(stdout); if (w_strategy != WGT_PB && msa->nseq >= pbswitch) { printf("[big alignment! doing PB]... "); PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); } else if (w_strategy == WGT_GSC) GSCWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); else if (w_strategy == WGT_BLOSUM) BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, msa->wgt); else if (w_strategy == WGT_PB) PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); else if (w_strategy == WGT_VORONOI) VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); printf("done.\n"); } /* Set the effective sequence number (if do_eff is FALSE, eff_nseq * was set to nseq). */ FNorm(msa->wgt, msa->nseq); FScale(msa->wgt, msa->nseq, eff_nseq); /* Build a model architecture. * If we're not doing MD or ME, that's all we need to do. * We get an allocated, counts-based HMM back. * * Because the architecture algorithms are allowed to change * gap characters in the alignment, we have to calculate the * alignment checksum before we enter the algorithms. */ printf("%-40s ... ", "Constructing model architecture"); fflush(stdout); checksum = GCGMultchecksum(msa->aseq, msa->nseq); if (c_strategy == P7_FAST_CONSTRUCTION) P7Fastmodelmaker(msa, dsq, gapmax, &hmm, &tr); else if (c_strategy == P7_HAND_CONSTRUCTION) P7Handmodelmaker(msa, dsq, &hmm, &tr); else P7Maxmodelmaker(msa, dsq, gapmax, pri, randomseq, p1, archpri, &hmm, &tr); hmm->checksum = checksum; printf("done.\n"); /* Save the count vectors if asked. Used primarily for * making the data files for training priors. */ if (cfile != NULL) { printf("%-40s ... ", "Saving count vector file"); fflush(stdout); save_countvectors(cfp, (msa->name != NULL ? msa->name : "-"), hmm); printf("done. [%s]\n", cfile); } /* Record the null model in the HMM; * add prior contributions in pseudocounts and renormalize. */ printf("%-40s ... ", "Converting counts to probabilities"); fflush(stdout); Plan7SetNullModel(hmm, randomseq, p1); P7PriorifyHMM(hmm, pri); printf("done.\n"); /* Model configuration, temporary. * hmmbuild assumes that it's given an alignment of single domains, * and the alignment may contain fragments. So, for the purpose of * scoring the sequences (or, optionally, MD/ME weighting), * configure the model into hmmsw mode. Later we'll * configure the model according to how the user wants to * use it. */ Plan7SWConfig(hmm, 0.5, 0.5); /* Do model-dependent "weighting" strategies. */ if (w_strategy == WGT_ME) { printf("\n%-40s ...\n", "Maximum entropy weighting, iterative"); maximum_entropy(hmm, dsq, msa, eff_nseq, pri, tr); printf("----------------------------------------------\n\n"); } /* Give the model a name. * We deal with this differently depending on whether * we're in an alignment database or a single alignment. * * If a single alignment, priority is: * 1. Use -n <name> if set. * 2. Use msa->name (avail in Stockholm or SELEX formats only) * 3. If all else fails, use alignment file name without * filename extension (e.g. "globins.slx" gets named "globins" * * If a multiple MSA database (e.g. Stockholm/Pfam), * only msa->name is applied. -n is not allowed. * if msa->name is unavailable, or -n was used, * a fatal error is thrown. * * Because we can't tell whether we've got more than one * alignment 'til we're on the second one, these fatal errors * only happen after the first HMM has already been built. * Oh well. */ printf("%-40s ... ", "Setting model name, etc."); fflush(stdout); if (nali == 0) /* first (only?) HMM in file: */ { if (setname != NULL) name = Strdup(setname); else if (msa->name != NULL) name = Strdup(msa->name); else name = FileTail(seqfile, TRUE); } else { if (setname != NULL) Die("Oops. Wait. You can't use -n with an alignment database."); else if (msa->name != NULL) name = Strdup(msa->name); else Die("Oops. Wait. I need name annotation on each alignment.\n"); } Plan7SetName(hmm, name); free(name); /* Transfer other information from the alignment to * the HMM. This typically only works for SELEX format * alignments, so these things are conditional/optional. */ if (msa->acc != NULL) Plan7SetAccession(hmm, msa->acc); if (msa->desc != NULL) Plan7SetDescription(hmm, msa->desc); if (msa->flags & MSA_SET_GA) { hmm->flags |= PLAN7_GA; hmm->ga1 = msa->ga1; hmm->ga2 = msa->ga2; } if (msa->flags & MSA_SET_TC) { hmm->flags |= PLAN7_TC; hmm->tc1 = msa->tc1; hmm->tc2 = msa->tc2; } if (msa->flags & MSA_SET_NC) { hmm->flags |= PLAN7_NC; hmm->nc1 = msa->nc1; hmm->nc2 = msa->nc2; } /* Record some other miscellaneous information in the HMM, * like how/when we built it. */ Plan7ComlogAppend(hmm, argc, argv); Plan7SetCtime(hmm); hmm->nseq = msa->nseq; printf("done. [%s]\n", hmm->name); /* Print information for the user */ printf("\nConstructed a profile HMM (length %d)\n", hmm->M); PrintPlan7Stats(stdout, hmm, dsq, msa->nseq, tr); printf("\n"); /* Configure the model for chosen algorithm */ printf("%-40s ... ", "Finalizing model configuration"); fflush(stdout); switch (cfg_strategy) { case P7_BASE_CONFIG: Plan7GlobalConfig(hmm); break; case P7_SW_CONFIG: Plan7SWConfig(hmm, swentry, swexit); break; case P7_LS_CONFIG: Plan7LSConfig(hmm); break; case P7_FS_CONFIG: Plan7FSConfig(hmm, swentry, swexit); break; default: Die("bogus configuration choice"); } printf("done.\n"); /* Save new HMM to disk: open a file for appending or writing. */ printf("%-40s ... ", "Saving model to file"); fflush(stdout); if (do_binary) WriteBinHMM(hmmfp, hmm); else WriteAscHMM(hmmfp, hmm); printf("done.\n"); /* the annotated alignment may be resaved */ if (alignfp != NULL) { MSA *new_msa; SQINFO *sqinfo; printf("%-40s ... ", "Saving annotated alignment"); fflush(stdout); sqinfo = MSAToSqinfo(msa); new_msa = P7Traces2Alignment(dsq, sqinfo, msa->wgt, msa->nseq, hmm->M, tr, FALSE); WriteStockholm(alignfp, new_msa); MSAFree(new_msa); for (idx = 0; idx < msa->nseq; idx++) FreeSequence(NULL, &(sqinfo[idx])); free(sqinfo); printf("done.\n"); } /* Verbose output; show scores for each sequence */ if (verbose) print_all_scores(stdout, hmm, dsq, msa, tr); /* Clean up before moving on to next alignment */ for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]); free(tr); FreePlan7(hmm); MSAFree(msa); Free2DArray((void **) dsq, msa->nseq); fflush(hmmfp); if (cfp != NULL) fflush(cfp); if (alignfp != NULL) fflush(alignfp); puts("//\n"); nali++; } /* Clean up and exit */ MSAFileClose(afp); fclose(hmmfp); if (cfp != NULL) fclose(cfp); if (alignfp != NULL) fclose(alignfp); P7FreePrior(pri); SqdClean(); return 0;}/* Function: print_all_scores() * * Purpose: For each training sequence, print its score under * the final model. * * Args: fp - where to print the output (usu. stdout) * hmm - newly constructed HMM, with prob's. * dsq - digitized unaligned training sequences. * msa - alignment and associated info * tr - array of tracebacks * * Return: (void) */static voidprint_all_scores(FILE *fp, struct plan7_s *hmm, char **dsq, MSA *msa, struct p7trace_s **tr){ int idx; /* counter for sequences */ /* make sure model scores are ready */ P7Logoddsify(hmm, TRUE); /* header */ fputs("**\n", fp); fputs("Individual training sequence scores:\n", fp); /* score for each sequence */ for (idx = 0; idx < msa->nseq; idx++) { fprintf(fp, "%7.2f %-12s %s\n", P7TraceScore(hmm, dsq[idx], tr[idx]), msa->sqname[idx], (MSAGetSeqDescription(msa,idx) != NULL) ? MSAGetSeqDescription(msa,idx) : ""); P7PrintTrace(fp, tr[idx], hmm, dsq[idx]); } fputs("\n", fp);}/* Function: save_countvectors() * * Purpose: Save emission/transition count vectors to a file. * Used for gathering the data on which to train a * prior (e.g. mixture Dirichlet, etc.) * * The format of the file is one vector per line: * M <f> <f> ...: 20 match emission counts in order AC..WY. * followed by two chars of CS, CA annotation. * I <f> <f> ...: 20 insert emission counts in order AC..WY. * followed by two chars of CS, CA annotation. * T <f> <f> ...: 7 transition counts in order TMM, TMI, TMD, * TIM, TII, TDM, TDD. (see structs.h) * followed by four chars of structure * annotation: CS, CS of M+1; CA, CA of M+1. * * Args: cfp - open counts file * name - name of alignment or HMM to associate with these vectors * hmm - counts-based HMM */static voidsave_countvectors(FILE *cfp, char *name, struct plan7_s *hmm){ int k, x; /* match emission vectors */ for (k = 1; k <= hmm->M; k++) { fputs("M ", cfp); for (x = 0; x < Alphabet_size; x++) fprintf(cfp, "%8.2f ", hmm->mat[k][x]); fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k); if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA) fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]); else fputs("- -", cfp); fputs("\n", cfp); } /* insert emission vectors */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -