📄 hmmbuild.c
  for (k = 1; k < hmm->M; k++)
    {
      fputs("I ", cfp);
      for (x = 0; x < Alphabet_size; x++)
        fprintf(cfp, "%8.2f ", hmm->ins[k][x]);
      fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
      if ((hmm->flags & PLAN7_CS) && (hmm->flags & PLAN7_CA))
        fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]);
      else
        fputs("- -", cfp);
      fputs("\n", cfp);
    }

  /* transition vectors */
  for (k = 1; k < hmm->M; k++)
    {
      fputs("T ", cfp);
      for (x = 0; x < 7; x++)
        fprintf(cfp, "%8.2f ", hmm->t[k][x]);
      fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
      if ((hmm->flags & PLAN7_CS) && (hmm->flags & PLAN7_CA))
        fprintf(cfp, "%c %c %c %c",
                hmm->cs[k], hmm->cs[k+1], hmm->ca[k], hmm->ca[k+1]);
      else
        fputs("- -", cfp);
      fputs("\n", cfp);
    }
}

/* Function: position_average_score()
 * Date:     Wed Dec 31 09:36:35 1997 [StL]
 *
 * Purpose:  Calculate scores from tracebacks, keeping them
 *           in a position specific array. The final array
 *           is normalized position-specifically too, according
 *           to how many sequences contributed data to this
 *           position. Used for compensating for sequence
 *           fragments in ME and MD score optimization.
 *           Very much ad hoc.
 *
 *           Code related to (derived from) TraceScore().
 *
 * Args:     hmm      - HMM structure, scores valid
 *           dsq      - digitized unaligned sequences
 *           wgt      - weights on the sequences
 *           nseq     - number of sequences
 *           tr       - array of nseq tracebacks that aligns each dsq to hmm
 *           pernode  - RETURN: [0]1..M array of position-specific avg scores
 *           ret_avg  - RETURN: overall average full-length, one-domain score
 *
 * Return:   (void)
 *           pernode is malloc'ed [0]1..M by CALLER and filled here.
 */
static void
position_average_score(struct plan7_s    *hmm,
                       char             **dsq,
                       float             *wgt,
                       int                nseq,
                       struct p7trace_s **tr,
                       float             *pernode,
                       float             *ret_avg)
{
  int    pos;                   /* position in seq                   */
  int    sym;                   /* digitized symbol at that position */
  int    tpos;                  /* position in trace/state sequence  */
  float *counts;                /* counts at each position           */
  float  avg;                   /* RETURN: average overall           */
  int    k;                     /* counter for model position        */
  int    idx;                   /* counter for sequence number       */

  /* Allocations */
  counts = MallocOrDie ((hmm->M+1) * sizeof(float));
  FSet(pernode, hmm->M+1, 0.);
  FSet(counts,  hmm->M+1, 0.);

  /* Loop over traces, accumulate weighted scores per position */
  for (idx = 0; idx < nseq; idx++)
    for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
      {
        pos = tr[idx]->pos[tpos];
        sym = (int) dsq[idx][tr[idx]->pos[tpos]];
        k   = tr[idx]->nodeidx[tpos];

        /* Counts: how many times did we use this model position 1..M?
         * (weighted)
         */
        if (tr[idx]->statetype[tpos] == STM || tr[idx]->statetype[tpos] == STD)
          counts[k] += wgt[idx];

        /* Emission scores. */
        if (tr[idx]->statetype[tpos] == STM)
          pernode[k] += wgt[idx] * Scorify(hmm->msc[sym][k]);
        else if (tr[idx]->statetype[tpos] == STI)
          pernode[k] += wgt[idx] * Scorify(hmm->isc[sym][k]);

        /* Transition scores. */
        if (tr[idx]->statetype[tpos] == STM ||
            tr[idx]->statetype[tpos] == STD ||
            tr[idx]->statetype[tpos] == STI)
          pernode[k] += wgt[idx] *
            Scorify(TransitionScoreLookup(hmm,
                                          tr[idx]->statetype[tpos],   tr[idx]->nodeidx[tpos],
                                          tr[idx]->statetype[tpos+1], tr[idx]->nodeidx[tpos+1]));
      }

  /* Divide accumulated scores by accumulated weighted counts */
  avg = 0.;
  for (k = 1; k <= hmm->M; k++)
    {
      pernode[k] /= counts[k];
      avg        += pernode[k];
    }

  free(counts);
  *ret_avg = avg;
  return;
}

/* Function: frag_trace_score()
 * Date:     SRE, Wed Dec 31 10:03:47 1997 [StL]
 *
 * Purpose:  Allow MD/ME optimization to be used for alignments
 *           that include fragments and multihits -- estimate a full-length
 *           per-domain score.
 *
 * Return:   "corrected" score.
 */
static float
frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
                 float *pernode, float expected)
{
  float sc;                     /* corrected score                      */
  float fragexp;                /* expected score for a trace like this */
  int   tpos;                   /* position in trace                    */

  /* get uncorrected score */
  sc = P7TraceScore(hmm, dsq, tr);

  /* calc expected score for trace like this */
  fragexp = 0.;
  for (tpos = 0; tpos < tr->tlen; tpos++)
    if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD)
      fragexp += pernode[tr->nodeidx[tpos]];

  /* correct for multihits */
  fragexp /= (float) TraceDomainNumber(tr);

  /* extrapolate to full-length, one-hit score */
  sc = sc * expected / fragexp;
  return sc;
}

/* Function: maximum_entropy()
 * Date:     SRE, Fri Jan 2 10:56:00 1998 [StL]
 *
 * Purpose:  Optimizes a model according to maximum entropy weighting.
 *           See Krogh and Mitchison (1995).
 *
 *           [Actually, we do minimum relative entropy, rather than
 *           maximum entropy. Same thing, though we refer to "ME"
 *           weights and models. The optimization is a steepest
 *           descents minimization of the relative entropy.]
 *
 *           Expects to be called shortly after a Maxmodelmaker()
 *           or Handmodelmaker(), so that both a new model architecture
 *           (with MAP parameters) and fake tracebacks are available.
 *
 *           Prints a summary of optimization progress to stdout.
 *
 * Args:     hmm      - model. allocated, set with initial MAP parameters.
 *           dsq      - dealigned digitized seqs the model is based on
 *           msa      - multiple alignment the model is based on
 *           eff_nseq - effective sequence number; weights normalize up to this.
 *           prior    - prior distributions for parameterizing model
 *           tr       - array of fake traces for each sequence
 *
 * Return:   (void)
 *           hmm changed to an ME HMM
 *           msa->wgt changed, contains ME weights
 */
static void
maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa,
                float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr)
{
  float *wgt;                   /* current best set of ME weights  */
  float *new_wgt;               /* new set of ME weights to try    */
  float *sc;                    /* log-odds score of each sequence */
  float *grad;                  /* gradient                        */
  float  epsilon;               /* steepness of descent            */
  float  relative_entropy;      /* current best relative entropy   */
  float  new_entropy;           /* relative entropy at new weights */
  float  last_new_entropy;      /* last new_entropy we calc'ed     */
  float  use_epsilon;           /* current epsilon value in use    */
  int    idx;                   /* counter over sequences          */
  int    i1, i2;                /* counters for iterations         */
  float  converge_criterion;
  float  minw, maxw;            /* min, max weight                 */
  int    posw, highw;           /* number of positive weights      */
  float  mins, maxs, avgs;      /* min, max, avg score             */
  float *pernode;               /* expected score per node of HMM  */
  float  expscore;              /* expected score of complete HMM  */
  int    max_iter;              /* bulletproof against infinite loop bugs */

  epsilon  = 0.2;               /* works fine */
  max_iter = 666;

  /* Allocations */
  sc      = MallocOrDie (sizeof(float) * msa->nseq);
  wgt     = MallocOrDie (sizeof(float) * msa->nseq);
  new_wgt = MallocOrDie (sizeof(float) * msa->nseq);
  grad    = MallocOrDie (sizeof(float) * msa->nseq);
  pernode = MallocOrDie (sizeof(float) * (hmm->M+1));

  /* Initialization. Start with all weights == 1.0.
   * Find relative entropy and gradient.
   */
  Plan7SWConfig(hmm, 0.5, 0.5);
  P7Logoddsify(hmm, TRUE);

  FSet(wgt, msa->nseq, 1.0);
  position_average_score(hmm, dsq, wgt, msa->nseq, tr, pernode, &expscore);
  for (idx = 0; idx < msa->nseq; idx++)
    sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
  relative_entropy = FSum(sc, msa->nseq) / (float) msa->nseq;
  for (idx = 0; idx < msa->nseq; idx++)
    grad[idx] = relative_entropy - sc[idx];

  printf("iter avg-sc min-sc max-sc min-wgt max-wgt +wgt ++wgt rel.ent convergence\n");
  printf("---- ------ ------ ------ ------- ------- ---- ----- ------- -----------\n");

  mins = maxs = avgs = sc[0];
  for (idx = 1; idx < msa->nseq; idx++)
    {
      if (sc[idx] < mins) mins = sc[idx];
      if (sc[idx] > maxs) maxs = sc[idx];
      avgs += sc[idx];
    }
  avgs /= (float) msa->nseq;
  printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8s\n",
         0, avgs, mins, maxs, 1.0, 1.0, msa->nseq, 0, relative_entropy, "-");

  /* Steepest descents optimization;
   * iterate until relative entropy converges.
   */
  i1 = 0;
  while (++i1 < max_iter)
    {
      /* Gradient gives us a line of steepest descents.
       * (Roughly speaking, anyway. We actually have a constraint
       * that weights are nonnegative and normalized, and the
       * gradient doesn't take these into account.)
       * Look along this line, a distance of epsilon * gradient:
       * if new point is better, accept; if new point is worse,
       * move back along the line by half the distance and re-evaluate.
       */
      use_epsilon = epsilon;
      new_entropy = relative_entropy + 1.0;    /* just ensure new > old */

      i2 = 0;
      while (new_entropy > relative_entropy && ++i2 < max_iter)
        {
          last_new_entropy = new_entropy;

          /* find a new point in weight space */
          for (idx = 0; idx < msa->nseq; idx++)
            {
              new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx];
              if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0;
            }
          FNorm(new_wgt, msa->nseq);
          FScale(new_wgt, msa->nseq, (float) msa->nseq);

          /* Make new HMM using these weights */
          ZeroPlan7(hmm);
          for (idx = 0; idx < msa->nseq; idx++)
            P7TraceCount(hmm, dsq[idx], new_wgt[idx], tr[idx]);
          P7PriorifyHMM(hmm, prior);

          /* Evaluate new point */
          Plan7SWConfig(hmm, 0.5, 0.5);
          P7Logoddsify(hmm, TRUE);
          position_average_score(hmm, dsq, new_wgt, msa->nseq, tr, pernode, &expscore);
          for (idx = 0; idx < msa->nseq; idx++)
            sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
          new_entropy = FDot(sc, new_wgt, msa->nseq) / (float) msa->nseq;

          use_epsilon /= 2.0;
          /* Failsafe: we're not converging. Set epsilon to zero,
           * do one more round.
           */
          if (use_epsilon < 1e-6) use_epsilon = 0.0;
          if (use_epsilon == 0.0) break;

          /* Failsafe: avoid infinite loops. Sometimes the new entropy
           * converges without ever being better than the previous point,
           * probably as a result of minor roundoff error.
           */
          if (last_new_entropy == new_entropy) break;
        }
      if (i2 == max_iter) printf("   -- exceeded maximum iterations; giving up --\n");

      /* Evaluate convergence before accepting the new weights;
       * then, accept the new point and evaluate the gradient there.
       */
      converge_criterion = fabs((relative_entropy - new_entropy) / relative_entropy);
      relative_entropy   = new_entropy;
      FCopy(wgt, new_wgt, msa->nseq);
      for (idx = 0; idx < msa->nseq; idx++)
        grad[idx] = relative_entropy - sc[idx];

      /* Print some statistics about this iteration */
      mins  = maxs = avgs = sc[0];
      minw  = maxw = wgt[0];
      posw  = (wgt[0] > 0.0) ? 1 : 0;
      highw = (wgt[0] > 1.0) ? 1 : 0;
      for (idx = 1; idx < msa->nseq; idx++)
        {
          if (sc[idx]  < mins) mins = sc[idx];
          if (sc[idx]  > maxs) maxs = sc[idx];
          if (wgt[idx] < minw) minw = wgt[idx];
          if (wgt[idx] > maxw) maxw = wgt[idx];
          if (wgt[idx] > 0.0)  posw++;
          if (wgt[idx] > 1.0)  highw++;
          avgs += sc[idx];
        }
      avgs /= (float) msa->nseq;
      printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8.5f\n",
             i1, avgs, mins, maxs, minw, maxw, posw, highw,
             relative_entropy, converge_criterion);

      if (converge_criterion < 1e-5) break;
    }
  if (i1 == max_iter) printf("   -- exceeded maximum iterations; giving up --\n");

  /* Renormalize weights to sum to eff_nseq, and save. */
  FNorm(wgt, msa->nseq);
  FScale(wgt, msa->nseq, (float) eff_nseq);
  FCopy(msa->wgt, wgt, msa->nseq);

  /* Make final HMM using these adjusted weights */
  ZeroPlan7(hmm);
  for (idx = 0; idx < msa->nseq; idx++)
    P7TraceCount(hmm, dsq[idx], wgt[idx], tr[idx]);
  P7PriorifyHMM(hmm, prior);

  /* Cleanup and return */
  free(pernode);
  free(new_wgt);
  free(grad);
  free(wgt);
  free(sc);
  return;
}
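The correction that frag_trace_score() applies boils down to one line of arithmetic: scale the raw trace score by the ratio of the expected full-length, one-domain score to the expected score of just the nodes the fragment visited, after dividing the latter by the number of domains in the trace. The standalone sketch below is not part of hmmbuild.c; the pernode values, the visited node list, the domain count, and the scores are all invented stand-ins for what position_average_score(), the traceback, TraceDomainNumber(), and P7TraceScore() would supply.

/* Illustrative sketch only -- not part of hmmbuild.c.
 * All numbers are hypothetical; this just redoes the fragment-score
 * extrapolation of frag_trace_score() without the HMMER structures.
 */
#include <stdio.h>

int
main(void)
{
  float pernode[6] = { 0., 1.5, 2.0, 1.0, 2.5, 3.0 }; /* expected score per node 1..5        */
  int   visited[]  = { 2, 3, 4 };  /* nodes reached by the fragment's M/D states             */
  int   nvisited   = 3;
  int   ndomains   = 1;            /* what TraceDomainNumber() would report                  */
  float expected   = 10.0;         /* expected full-length, one-domain score                 */
  float sc         = 4.2;          /* raw fragment score, as P7TraceScore() would return it  */
  float fragexp    = 0.;
  int   i;

  /* expected score of a trace shaped like this fragment */
  for (i = 0; i < nvisited; i++)
    fragexp += pernode[visited[i]];
  fragexp /= (float) ndomains;     /* correct for multihits */

  /* extrapolate to a full-length, one-hit score */
  printf("corrected score = %.2f\n", sc * expected / fragexp);
  return 0;
}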
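The inner loop of maximum_entropy() is a projected steepest-descent step: move the weights along the gradient by use_epsilon, clamp anything negative to zero, then renormalize so the weights again sum to nseq. The standalone sketch below isolates that single update; norm_to() is a local helper assumed to behave like the FNorm()/FScale() pair used above, and the weight and gradient values are made up for illustration.

/* Illustrative sketch only -- not part of hmmbuild.c.
 * One steepest-descent weight update, with made-up values.
 */
#include <stdio.h>

/* local stand-in for FNorm() followed by FScale(): rescale v to sum to total */
static void
norm_to(float *v, int n, float total)
{
  float sum = 0.;
  int   i;
  for (i = 0; i < n; i++) sum += v[i];
  for (i = 0; i < n; i++) v[i] = v[i] / sum * total;
}

int
main(void)
{
  float wgt[4]  = { 1.0, 1.0, 1.0, 1.0 };   /* current weights                          */
  float grad[4] = { 0.8, -0.3, -0.6, 0.1 }; /* hypothetical relative_entropy - sc[idx]  */
  float new_wgt[4];
  float use_epsilon = 0.2;
  int   nseq = 4, idx;

  /* one step along the gradient, clamped at zero */
  for (idx = 0; idx < nseq; idx++)
    {
      new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx];
      if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0;
    }
  norm_to(new_wgt, nseq, (float) nseq);     /* keep weights summing to nseq */

  for (idx = 0; idx < nseq; idx++)
    printf("seq %d: weight %.3f -> %.3f\n", idx, wgt[idx], new_wgt[idx]);
  return 0;
}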